diff --git a/.dockerignore b/.dockerignore new file mode 100644 index 00000000..fa1076d6 --- /dev/null +++ b/.dockerignore @@ -0,0 +1,8 @@ +main/src/tests +/docs +/.github +*~ +**/.ipynb_checkpoints/ +README.md +LICENSE +/.venv \ No newline at end of file diff --git a/.github/dependabot.yml b/.github/dependabot.yml new file mode 100644 index 00000000..5ea2decd --- /dev/null +++ b/.github/dependabot.yml @@ -0,0 +1,8 @@ +version: 2 +updates: + - package-ecosystem: "github-actions" + directory: "/" + target-branch: "hotfix" + schedule: + # Check for updates to GitHub Actions every weekday + interval: "daily" diff --git a/.github/workflows/container_build.yml b/.github/workflows/container_build.yml new file mode 100644 index 00000000..f24b7152 --- /dev/null +++ b/.github/workflows/container_build.yml @@ -0,0 +1,87 @@ +name: Container Build +on: + push: + tags: + - "*" + +env: + REGISTRY: ghcr.io + IMAGE_NAME: ${{ github.repository }} + DEBIAN_FRONTEND: noninteractive + +jobs: + build-and-push-image: + # IMPORTANT: Only run the publish job when a tag starting with 'v' is pushed + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/v') + + name: Build and Push Docker Image + runs-on: ubuntu-latest + permissions: + contents: read + packages: write + + # A platform matrix is used here so the build can be extended to additional architectures, + # for example Intel/AMD (linux/amd64) or Apple Silicon (linux/arm64) + strategy: + matrix: + platform: [ linux/amd64 ] + + steps: + # Checkout the repository + - name: Checkout Repository + uses: actions/checkout@v4 + with: + ref: ${{ github.ref }} + + # Get tag/release information for Docker tags + - name: Docker Metadata + id: metadata + uses: docker/metadata-action@v5 + with: + flavor: | + latest=false + prefix= + suffix= + images: ${{ env.REGISTRY }}/${{ env.IMAGE_NAME }} + tags: | + type=semver,pattern={{version}} + type=raw,value=latest,enable=${{ startsWith(github.ref, 'refs/tags/') && endsWith(github.ref, 'master') }} + + - name: Set up Docker Buildx + id: buildx + uses: docker/setup-buildx-action@v3 + + - name: Set up QEMU + uses: docker/setup-qemu-action@v3 + with: + platforms: ${{ matrix.platform }} + + - name: Set up Jupyter Notebook Cleaner + uses: actions/setup-python@v5 + with: + python-version: "3.10" + + - name: Clean Jupyter Notebook Outputs + run: | + python -m pip install --upgrade nbstripout --no-cache-dir + find $GITHUB_WORKSPACE -name "*.ipynb" -exec nbstripout "{}" \; + + # Log into the GitHub Container Registry so we can push the image + - name: Log in to Container Registry + uses: docker/login-action@v3 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + + - name: Build and push + uses: docker/build-push-action@v6 + with: + context: .
+ push: true + file: ./Dockerfile + tags: ${{ steps.metadata.outputs.tags }} + labels: ${{ steps.metadata.outputs.labels }} + builder: ${{ steps.buildx.outputs.name }} + platforms: ${{ matrix.platform }} + diff --git a/.github/workflows/continuous_integration.yml b/.github/workflows/continuous_integration.yml new file mode 100644 index 00000000..286f54d0 --- /dev/null +++ b/.github/workflows/continuous_integration.yml @@ -0,0 +1,33 @@ +name: Continuous Integration +on: + push: + branches: + - main + pull_request: + workflow_dispatch: + +jobs: + test: + runs-on: ubuntu-latest + strategy: + matrix: + python-version: [ "3.11", "3.12", "3.13" ] + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install uv + uses: astral-sh/setup-uv@v5 + with: + enable-cache: "true" + cache-suffix: "${{ matrix.python-version }}" + cache-dependency-glob: 'uv.lock' + + - name: Install Dependencies + run: uv sync --python "${{ matrix.python-version }}" --all-extras --dev + + - name: Run tests + run: uv run --python "${{ matrix.python-version }}" pytest --cov --junitxml=junit.xml -o junit_family=legacy + + - name: Cache Clear + run: uv cache prune --ci diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..6123cff0 --- /dev/null +++ b/.gitignore @@ -0,0 +1,24 @@ +/**/*.Rhistory +/**/.DS_Store +/**/.idea +/**/.ipynb_checkpoints/ +/**/.jekyll-cache +/**/.mypy_cache +/**/.pytest_cache +/**/__pycache__ + +# Documentation +docs/_site +docs/Gemfile.lock + +# COMO-specific files +.hypothesis +**/*.Rout +main/logs/ +main/data/gene_info.csv +main/data/Repurposing_Hub_Preproc.tsv +main/data/results/ +main/data/config_sheets + +# COMO data +main/data/data_matrices diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml new file mode 100644 index 00000000..bf67014b --- /dev/null +++ b/.pre-commit-config.yaml @@ -0,0 +1,6 @@ +repos: + - repo: https://github.com/commitizen-tools/commitizen + rev: master + hooks: + - id: commitizen + stages: [ commit-msg ] \ No newline at end of file diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 00000000..850064f2 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,12 @@ +FROM python:3.10 AS app + +WORKDIR /app +ENV PATH="/app/.venv/bin:$PATH" +COPY --from=ghcr.io/astral-sh/uv:latest /uv /uvx /bin/ +COPY --chown=1000:100 main /app/main/ +COPY --chown=1000:100 pyproject.toml /app/pyproject.toml + +RUN uv sync && uv pip install jupyterlab +EXPOSE 8888 +VOLUME "/app/main/data/local_files" +CMD ["jupyter", "lab", "--allow-root", "--no-browser", "--ip=0.0.0.0", "--port=8888", "--notebook-dir=/app/main", "--NotebookApp.token=''", "--NotebookApp.password=''"] diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..f288702d --- /dev/null +++ b/LICENSE @@ -0,0 +1,674 @@ + GNU GENERAL PUBLIC LICENSE + Version 3, 29 June 2007 + + Copyright (C) 2007 Free Software Foundation, Inc. + Everyone is permitted to copy and distribute verbatim copies + of this license document, but changing it is not allowed. + + Preamble + + The GNU General Public License is a free, copyleft license for +software and other kinds of works. + + The licenses for most software and other practical works are designed +to take away your freedom to share and change the works. By contrast, +the GNU General Public License is intended to guarantee your freedom to +share and change all versions of a program--to make sure it remains free +software for all its users. 
We, the Free Software Foundation, use the +GNU General Public License for most of our software; it applies also to +any other work released this way by its authors. You can apply it to +your programs, too. + + When we speak of free software, we are referring to freedom, not +price. Our General Public Licenses are designed to make sure that you +have the freedom to distribute copies of free software (and charge for +them if you wish), that you receive source code or can get it if you +want it, that you can change the software or use pieces of it in new +free programs, and that you know you can do these things. + + To protect your rights, we need to prevent others from denying you +these rights or asking you to surrender the rights. Therefore, you have +certain responsibilities if you distribute copies of the software, or if +you modify it: responsibilities to respect the freedom of others. + + For example, if you distribute copies of such a program, whether +gratis or for a fee, you must pass on to the recipients the same +freedoms that you received. You must make sure that they, too, receive +or can get the source code. And you must show them these terms so they +know their rights. + + Developers that use the GNU GPL protect your rights with two steps: +(1) assert copyright on the software, and (2) offer you this License +giving you legal permission to copy, distribute and/or modify it. + + For the developers' and authors' protection, the GPL clearly explains +that there is no warranty for this free software. For both users' and +authors' sake, the GPL requires that modified versions be marked as +changed, so that their problems will not be attributed erroneously to +authors of previous versions. + + Some devices are designed to deny users access to install or run +modified versions of the software inside them, although the manufacturer +can do so. This is fundamentally incompatible with the aim of +protecting users' freedom to change the software. The systematic +pattern of such abuse occurs in the area of products for individuals to +use, which is precisely where it is most unacceptable. Therefore, we +have designed this version of the GPL to prohibit the practice for those +products. If such problems arise substantially in other domains, we +stand ready to extend this provision to those domains in future versions +of the GPL, as needed to protect the freedom of users. + + Finally, every program is threatened constantly by software patents. +States should not allow patents to restrict development and use of +software on general-purpose computers, but in those that do, we wish to +avoid the special danger that patents applied to a free program could +make it effectively proprietary. To prevent this, the GPL assures that +patents cannot be used to render the program non-free. + + The precise terms and conditions for copying, distribution and +modification follow. + + TERMS AND CONDITIONS + + 0. Definitions. + + "This License" refers to version 3 of the GNU General Public License. + + "Copyright" also means copyright-like laws that apply to other kinds of +works, such as semiconductor masks. + + "The Program" refers to any copyrightable work licensed under this +License. Each licensee is addressed as "you". "Licensees" and +"recipients" may be individuals or organizations. + + To "modify" a work means to copy from or adapt all or part of the work +in a fashion requiring copyright permission, other than the making of an +exact copy. 
The resulting work is called a "modified version" of the +earlier work or a work "based on" the earlier work. + + A "covered work" means either the unmodified Program or a work based +on the Program. + + To "propagate" a work means to do anything with it that, without +permission, would make you directly or secondarily liable for +infringement under applicable copyright law, except executing it on a +computer or modifying a private copy. Propagation includes copying, +distribution (with or without modification), making available to the +public, and in some countries other activities as well. + + To "convey" a work means any kind of propagation that enables other +parties to make or receive copies. Mere interaction with a user through +a computer network, with no transfer of a copy, is not conveying. + + An interactive user interface displays "Appropriate Legal Notices" +to the extent that it includes a convenient and prominently visible +feature that (1) displays an appropriate copyright notice, and (2) +tells the user that there is no warranty for the work (except to the +extent that warranties are provided), that licensees may convey the +work under this License, and how to view a copy of this License. If +the interface presents a list of user commands or options, such as a +menu, a prominent item in the list meets this criterion. + + 1. Source Code. + + The "source code" for a work means the preferred form of the work +for making modifications to it. "Object code" means any non-source +form of a work. + + A "Standard Interface" means an interface that either is an official +standard defined by a recognized standards body, or, in the case of +interfaces specified for a particular programming language, one that +is widely used among developers working in that language. + + The "System Libraries" of an executable work include anything, other +than the work as a whole, that (a) is included in the normal form of +packaging a Major Component, but which is not part of that Major +Component, and (b) serves only to enable use of the work with that +Major Component, or to implement a Standard Interface for which an +implementation is available to the public in source code form. A +"Major Component", in this context, means a major essential component +(kernel, window system, and so on) of the specific operating system +(if any) on which the executable work runs, or a compiler used to +produce the work, or an object code interpreter used to run it. + + The "Corresponding Source" for a work in object code form means all +the source code needed to generate, install, and (for an executable +work) run the object code and to modify the work, including scripts to +control those activities. However, it does not include the work's +System Libraries, or general-purpose tools or generally available free +programs which are used unmodified in performing those activities but +which are not part of the work. For example, Corresponding Source +includes interface definition files associated with source files for +the work, and the source code for shared libraries and dynamically +linked subprograms that the work is specifically designed to require, +such as by intimate data communication or control flow between those +subprograms and other parts of the work. + + The Corresponding Source need not include anything that users +can regenerate automatically from other parts of the Corresponding +Source. + + The Corresponding Source for a work in source code form is that +same work. + + 2. Basic Permissions. 
+ + All rights granted under this License are granted for the term of +copyright on the Program, and are irrevocable provided the stated +conditions are met. This License explicitly affirms your unlimited +permission to run the unmodified Program. The output from running a +covered work is covered by this License only if the output, given its +content, constitutes a covered work. This License acknowledges your +rights of fair use or other equivalent, as provided by copyright law. + + You may make, run and propagate covered works that you do not +convey, without conditions so long as your license otherwise remains +in force. You may convey covered works to others for the sole purpose +of having them make modifications exclusively for you, or provide you +with facilities for running those works, provided that you comply with +the terms of this License in conveying all material for which you do +not control copyright. Those thus making or running the covered works +for you must do so exclusively on your behalf, under your direction +and control, on terms that prohibit them from making any copies of +your copyrighted material outside their relationship with you. + + Conveying under any other circumstances is permitted solely under +the conditions stated below. Sublicensing is not allowed; section 10 +makes it unnecessary. + + 3. Protecting Users' Legal Rights From Anti-Circumvention Law. + + No covered work shall be deemed part of an effective technological +measure under any applicable law fulfilling obligations under article +11 of the WIPO copyright treaty adopted on 20 December 1996, or +similar laws prohibiting or restricting circumvention of such +measures. + + When you convey a covered work, you waive any legal power to forbid +circumvention of technological measures to the extent such circumvention +is effected by exercising rights under this License with respect to +the covered work, and you disclaim any intention to limit operation or +modification of the work as a means of enforcing, against the work's +users, your or third parties' legal rights to forbid circumvention of +technological measures. + + 4. Conveying Verbatim Copies. + + You may convey verbatim copies of the Program's source code as you +receive it, in any medium, provided that you conspicuously and +appropriately publish on each copy an appropriate copyright notice; +keep intact all notices stating that this License and any +non-permissive terms added in accord with section 7 apply to the code; +keep intact all notices of the absence of any warranty; and give all +recipients a copy of this License along with the Program. + + You may charge any price or no price for each copy that you convey, +and you may offer support or warranty protection for a fee. + + 5. Conveying Modified Source Versions. + + You may convey a work based on the Program, or the modifications to +produce it from the Program, in the form of source code under the +terms of section 4, provided that you also meet all of these conditions: + + a) The work must carry prominent notices stating that you modified + it, and giving a relevant date. + + b) The work must carry prominent notices stating that it is + released under this License and any conditions added under section + 7. This requirement modifies the requirement in section 4 to + "keep intact all notices". + + c) You must license the entire work, as a whole, under this + License to anyone who comes into possession of a copy. 
This + License will therefore apply, along with any applicable section 7 + additional terms, to the whole of the work, and all its parts, + regardless of how they are packaged. This License gives no + permission to license the work in any other way, but it does not + invalidate such permission if you have separately received it. + + d) If the work has interactive user interfaces, each must display + Appropriate Legal Notices; however, if the Program has interactive + interfaces that do not display Appropriate Legal Notices, your + work need not make them do so. + + A compilation of a covered work with other separate and independent +works, which are not by their nature extensions of the covered work, +and which are not combined with it such as to form a larger program, +in or on a volume of a storage or distribution medium, is called an +"aggregate" if the compilation and its resulting copyright are not +used to limit the access or legal rights of the compilation's users +beyond what the individual works permit. Inclusion of a covered work +in an aggregate does not cause this License to apply to the other +parts of the aggregate. + + 6. Conveying Non-Source Forms. + + You may convey a covered work in object code form under the terms +of sections 4 and 5, provided that you also convey the +machine-readable Corresponding Source under the terms of this License, +in one of these ways: + + a) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by the + Corresponding Source fixed on a durable physical medium + customarily used for software interchange. + + b) Convey the object code in, or embodied in, a physical product + (including a physical distribution medium), accompanied by a + written offer, valid for at least three years and valid for as + long as you offer spare parts or customer support for that product + model, to give anyone who possesses the object code either (1) a + copy of the Corresponding Source for all the software in the + product that is covered by this License, on a durable physical + medium customarily used for software interchange, for a price no + more than your reasonable cost of physically performing this + conveying of source, or (2) access to copy the + Corresponding Source from a network server at no charge. + + c) Convey individual copies of the object code with a copy of the + written offer to provide the Corresponding Source. This + alternative is allowed only occasionally and noncommercially, and + only if you received the object code with such an offer, in accord + with subsection 6b. + + d) Convey the object code by offering access from a designated + place (gratis or for a charge), and offer equivalent access to the + Corresponding Source in the same way through the same place at no + further charge. You need not require recipients to copy the + Corresponding Source along with the object code. If the place to + copy the object code is a network server, the Corresponding Source + may be on a different server (operated by you or a third party) + that supports equivalent copying facilities, provided you maintain + clear directions next to the object code saying where to find the + Corresponding Source. Regardless of what server hosts the + Corresponding Source, you remain obligated to ensure that it is + available for as long as needed to satisfy these requirements. 
+ + e) Convey the object code using peer-to-peer transmission, provided + you inform other peers where the object code and Corresponding + Source of the work are being offered to the general public at no + charge under subsection 6d. + + A separable portion of the object code, whose source code is excluded +from the Corresponding Source as a System Library, need not be +included in conveying the object code work. + + A "User Product" is either (1) a "consumer product", which means any +tangible personal property which is normally used for personal, family, +or household purposes, or (2) anything designed or sold for incorporation +into a dwelling. In determining whether a product is a consumer product, +doubtful cases shall be resolved in favor of coverage. For a particular +product received by a particular user, "normally used" refers to a +typical or common use of that class of product, regardless of the status +of the particular user or of the way in which the particular user +actually uses, or expects or is expected to use, the product. A product +is a consumer product regardless of whether the product has substantial +commercial, industrial or non-consumer uses, unless such uses represent +the only significant mode of use of the product. + + "Installation Information" for a User Product means any methods, +procedures, authorization keys, or other information required to install +and execute modified versions of a covered work in that User Product from +a modified version of its Corresponding Source. The information must +suffice to ensure that the continued functioning of the modified object +code is in no case prevented or interfered with solely because +modification has been made. + + If you convey an object code work under this section in, or with, or +specifically for use in, a User Product, and the conveying occurs as +part of a transaction in which the right of possession and use of the +User Product is transferred to the recipient in perpetuity or for a +fixed term (regardless of how the transaction is characterized), the +Corresponding Source conveyed under this section must be accompanied +by the Installation Information. But this requirement does not apply +if neither you nor any third party retains the ability to install +modified object code on the User Product (for example, the work has +been installed in ROM). + + The requirement to provide Installation Information does not include a +requirement to continue to provide support service, warranty, or updates +for a work that has been modified or installed by the recipient, or for +the User Product in which it has been modified or installed. Access to a +network may be denied when the modification itself materially and +adversely affects the operation of the network or violates the rules and +protocols for communication across the network. + + Corresponding Source conveyed, and Installation Information provided, +in accord with this section must be in a format that is publicly +documented (and with an implementation available to the public in +source code form), and must require no special password or key for +unpacking, reading or copying. + + 7. Additional Terms. + + "Additional permissions" are terms that supplement the terms of this +License by making exceptions from one or more of its conditions. +Additional permissions that are applicable to the entire Program shall +be treated as though they were included in this License, to the extent +that they are valid under applicable law. 
If additional permissions +apply only to part of the Program, that part may be used separately +under those permissions, but the entire Program remains governed by +this License without regard to the additional permissions. + + When you convey a copy of a covered work, you may at your option +remove any additional permissions from that copy, or from any part of +it. (Additional permissions may be written to require their own +removal in certain cases when you modify the work.) You may place +additional permissions on material, added by you to a covered work, +for which you have or can give appropriate copyright permission. + + Notwithstanding any other provision of this License, for material you +add to a covered work, you may (if authorized by the copyright holders of +that material) supplement the terms of this License with terms: + + a) Disclaiming warranty or limiting liability differently from the + terms of sections 15 and 16 of this License; or + + b) Requiring preservation of specified reasonable legal notices or + author attributions in that material or in the Appropriate Legal + Notices displayed by works containing it; or + + c) Prohibiting misrepresentation of the origin of that material, or + requiring that modified versions of such material be marked in + reasonable ways as different from the original version; or + + d) Limiting the use for publicity purposes of names of licensors or + authors of the material; or + + e) Declining to grant rights under trademark law for use of some + trade names, trademarks, or service marks; or + + f) Requiring indemnification of licensors and authors of that + material by anyone who conveys the material (or modified versions of + it) with contractual assumptions of liability to the recipient, for + any liability that these contractual assumptions directly impose on + those licensors and authors. + + All other non-permissive additional terms are considered "further +restrictions" within the meaning of section 10. If the Program as you +received it, or any part of it, contains a notice stating that it is +governed by this License along with a term that is a further +restriction, you may remove that term. If a license document contains +a further restriction but permits relicensing or conveying under this +License, you may add to a covered work material governed by the terms +of that license document, provided that the further restriction does +not survive such relicensing or conveying. + + If you add terms to a covered work in accord with this section, you +must place, in the relevant source files, a statement of the +additional terms that apply to those files, or a notice indicating +where to find the applicable terms. + + Additional terms, permissive or non-permissive, may be stated in the +form of a separately written license, or stated as exceptions; +the above requirements apply either way. + + 8. Termination. + + You may not propagate or modify a covered work except as expressly +provided under this License. Any attempt otherwise to propagate or +modify it is void, and will automatically terminate your rights under +this License (including any patent licenses granted under the third +paragraph of section 11). 
+ + However, if you cease all violation of this License, then your +license from a particular copyright holder is reinstated (a) +provisionally, unless and until the copyright holder explicitly and +finally terminates your license, and (b) permanently, if the copyright +holder fails to notify you of the violation by some reasonable means +prior to 60 days after the cessation. + + Moreover, your license from a particular copyright holder is +reinstated permanently if the copyright holder notifies you of the +violation by some reasonable means, this is the first time you have +received notice of violation of this License (for any work) from that +copyright holder, and you cure the violation prior to 30 days after +your receipt of the notice. + + Termination of your rights under this section does not terminate the +licenses of parties who have received copies or rights from you under +this License. If your rights have been terminated and not permanently +reinstated, you do not qualify to receive new licenses for the same +material under section 10. + + 9. Acceptance Not Required for Having Copies. + + You are not required to accept this License in order to receive or +run a copy of the Program. Ancillary propagation of a covered work +occurring solely as a consequence of using peer-to-peer transmission +to receive a copy likewise does not require acceptance. However, +nothing other than this License grants you permission to propagate or +modify any covered work. These actions infringe copyright if you do +not accept this License. Therefore, by modifying or propagating a +covered work, you indicate your acceptance of this License to do so. + + 10. Automatic Licensing of Downstream Recipients. + + Each time you convey a covered work, the recipient automatically +receives a license from the original licensors, to run, modify and +propagate that work, subject to this License. You are not responsible +for enforcing compliance by third parties with this License. + + An "entity transaction" is a transaction transferring control of an +organization, or substantially all assets of one, or subdividing an +organization, or merging organizations. If propagation of a covered +work results from an entity transaction, each party to that +transaction who receives a copy of the work also receives whatever +licenses to the work the party's predecessor in interest had or could +give under the previous paragraph, plus a right to possession of the +Corresponding Source of the work from the predecessor in interest, if +the predecessor has it or can get it with reasonable efforts. + + You may not impose any further restrictions on the exercise of the +rights granted or affirmed under this License. For example, you may +not impose a license fee, royalty, or other charge for exercise of +rights granted under this License, and you may not initiate litigation +(including a cross-claim or counterclaim in a lawsuit) alleging that +any patent claim is infringed by making, using, selling, offering for +sale, or importing the Program or any portion of it. + + 11. Patents. + + A "contributor" is a copyright holder who authorizes use under this +License of the Program or a work on which the Program is based. The +work thus licensed is called the contributor's "contributor version". 
+ + A contributor's "essential patent claims" are all patent claims +owned or controlled by the contributor, whether already acquired or +hereafter acquired, that would be infringed by some manner, permitted +by this License, of making, using, or selling its contributor version, +but do not include claims that would be infringed only as a +consequence of further modification of the contributor version. For +purposes of this definition, "control" includes the right to grant +patent sublicenses in a manner consistent with the requirements of +this License. + + Each contributor grants you a non-exclusive, worldwide, royalty-free +patent license under the contributor's essential patent claims, to +make, use, sell, offer for sale, import and otherwise run, modify and +propagate the contents of its contributor version. + + In the following three paragraphs, a "patent license" is any express +agreement or commitment, however denominated, not to enforce a patent +(such as an express permission to practice a patent or covenant not to +sue for patent infringement). To "grant" such a patent license to a +party means to make such an agreement or commitment not to enforce a +patent against the party. + + If you convey a covered work, knowingly relying on a patent license, +and the Corresponding Source of the work is not available for anyone +to copy, free of charge and under the terms of this License, through a +publicly available network server or other readily accessible means, +then you must either (1) cause the Corresponding Source to be so +available, or (2) arrange to deprive yourself of the benefit of the +patent license for this particular work, or (3) arrange, in a manner +consistent with the requirements of this License, to extend the patent +license to downstream recipients. "Knowingly relying" means you have +actual knowledge that, but for the patent license, your conveying the +covered work in a country, or your recipient's use of the covered work +in a country, would infringe one or more identifiable patents in that +country that you have reason to believe are valid. + + If, pursuant to or in connection with a single transaction or +arrangement, you convey, or propagate by procuring conveyance of, a +covered work, and grant a patent license to some of the parties +receiving the covered work authorizing them to use, propagate, modify +or convey a specific copy of the covered work, then the patent license +you grant is automatically extended to all recipients of the covered +work and works based on it. + + A patent license is "discriminatory" if it does not include within +the scope of its coverage, prohibits the exercise of, or is +conditioned on the non-exercise of one or more of the rights that are +specifically granted under this License. You may not convey a covered +work if you are a party to an arrangement with a third party that is +in the business of distributing software, under which you make payment +to the third party based on the extent of your activity of conveying +the work, and under which the third party grants, to any of the +parties who would receive the covered work from you, a discriminatory +patent license (a) in connection with copies of the covered work +conveyed by you (or copies made from those copies), or (b) primarily +for and in connection with specific products or compilations that +contain the covered work, unless you entered into that arrangement, +or that patent license was granted, prior to 28 March 2007. 
+ + Nothing in this License shall be construed as excluding or limiting +any implied license or other defenses to infringement that may +otherwise be available to you under applicable patent law. + + 12. No Surrender of Others' Freedom. + + If conditions are imposed on you (whether by court order, agreement or +otherwise) that contradict the conditions of this License, they do not +excuse you from the conditions of this License. If you cannot convey a +covered work so as to satisfy simultaneously your obligations under this +License and any other pertinent obligations, then as a consequence you may +not convey it at all. For example, if you agree to terms that obligate you +to collect a royalty for further conveying from those to whom you convey +the Program, the only way you could satisfy both those terms and this +License would be to refrain entirely from conveying the Program. + + 13. Use with the GNU Affero General Public License. + + Notwithstanding any other provision of this License, you have +permission to link or combine any covered work with a work licensed +under version 3 of the GNU Affero General Public License into a single +combined work, and to convey the resulting work. The terms of this +License will continue to apply to the part which is the covered work, +but the special requirements of the GNU Affero General Public License, +section 13, concerning interaction through a network will apply to the +combination as such. + + 14. Revised Versions of this License. + + The Free Software Foundation may publish revised and/or new versions of +the GNU General Public License from time to time. Such new versions will +be similar in spirit to the present version, but may differ in detail to +address new problems or concerns. + + Each version is given a distinguishing version number. If the +Program specifies that a certain numbered version of the GNU General +Public License "or any later version" applies to it, you have the +option of following the terms and conditions either of that numbered +version or of any later version published by the Free Software +Foundation. If the Program does not specify a version number of the +GNU General Public License, you may choose any version ever published +by the Free Software Foundation. + + If the Program specifies that a proxy can decide which future +versions of the GNU General Public License can be used, that proxy's +public statement of acceptance of a version permanently authorizes you +to choose that version for the Program. + + Later license versions may give you additional or different +permissions. However, no additional obligations are imposed on any +author or copyright holder as a result of your choosing to follow a +later version. + + 15. Disclaimer of Warranty. + + THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY +APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT +HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY +OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, +THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM +IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF +ALL NECESSARY SERVICING, REPAIR OR CORRECTION. + + 16. Limitation of Liability. 
+ + IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS +THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY +GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE +USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF +DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD +PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS), +EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF +SUCH DAMAGES. + + 17. Interpretation of Sections 15 and 16. + + If the disclaimer of warranty and limitation of liability provided +above cannot be given local legal effect according to their terms, +reviewing courts shall apply local law that most closely approximates +an absolute waiver of all civil liability in connection with the +Program, unless a warranty or assumption of liability accompanies a +copy of the Program in return for a fee. + + END OF TERMS AND CONDITIONS + + How to Apply These Terms to Your New Programs + + If you develop a new program, and you want it to be of the greatest +possible use to the public, the best way to achieve this is to make it +free software which everyone can redistribute and change under these terms. + + To do so, attach the following notices to the program. It is safest +to attach them to the start of each source file to most effectively +state the exclusion of warranty; and each file should have at least +the "copyright" line and a pointer to where the full notice is found. + + + <one line to give the program's name and a brief idea of what it does.> + Copyright (C) <year> <name of author> + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see <https://www.gnu.org/licenses/>. + +Also add information on how to contact you by electronic and paper mail. + + If the program does terminal interaction, make it output a short +notice like this when it starts in an interactive mode: + + <program> Copyright (C) <year> <name of author> + This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'. + This is free software, and you are welcome to redistribute it + under certain conditions; type `show c' for details. + +The hypothetical commands `show w' and `show c' should show the appropriate +parts of the General Public License. Of course, your program's commands +might be different; for a GUI interface, you would use an "about box". + + You should also get your employer (if you work as a programmer) or school, +if any, to sign a "copyright disclaimer" for the program, if necessary. +For more information on this, and how to apply and follow the GNU GPL, see +<https://www.gnu.org/licenses/>. + + The GNU General Public License does not permit incorporating your program +into proprietary programs. If your program is a subroutine library, you +may consider it more useful to permit linking proprietary applications with +the library. If this is what you want to do, use the GNU Lesser General +Public License instead of this License. But first, please read +<https://www.gnu.org/licenses/why-not-lgpl.html>.
diff --git a/README.md b/README.md index 1965e6ec..b00162bc 100644 --- a/README.md +++ b/README.md @@ -1,30 +1,74 @@ -# pipelines +# COMO: Constraint-based Optimization of Metabolic Objectives -This is the home page for pipeline project. +![GitHub release (with filter)](https://img.shields.io/github/v/release/HelikarLab/COMO?filter=v*-master&style=for-the-badge&color=blue) +![GitHub Workflow Status (with event)](https://img.shields.io/github/actions/workflow/status/HelikarLab/COMO/unit_tests.yml?style=for-the-badge&logo=pytest&logoColor=white&label=Tests) +![GitHub Workflow Status (with event)](https://img.shields.io/github/actions/workflow/status/HelikarLab/COMO/container_build.yml?style=for-the-badge&logo=docker&logoColor=white&label=Docker%20Build) +[![pre-commit](https://img.shields.io/badge/pre--commit-enabled-brightgreen?logo=pre-commit)](https://github.com/pre-commit/pre-commit) -## FlowCharts -![1](./doc/IMG_1.jpg) -![2](./doc/IMG_2.jpg) +# Setting up COMO -![3](./doc/IMG_3.jpg) +Go to COMO's documentation page for full installation and operation instructions, or use one of the Quick Start options below. -![4](./doc/IMG_4.jpg) +## [COMO Documentation Page](https://helikarlab.github.io/COMO) -## Run Script -1. Download Data from https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE2770 -2. Run Affy_Script.r in the same folder of the extracted data files +## Quick Start Options -Raw data -RMAC Data + +### Docker Quick Start + +This installation method **does** require Docker. + +- [Install Docker](https://docs.docker.com/get-docker/) +- Pull our latest container + - `docker pull ghcr.io/helikarlab/como:latest` +- Run the container + - `docker run -p 8888:8888 ghcr.io/helikarlab/como:latest` + +> **NOTE**: The default installation method here **does not** allow for saving your work or utilizing +> the [Gurobi solver](https://www.gurobi.com/). If you would like either (or both) of these features, +> please [visit our documentation](https://helikarlab.github.io/COMO) for more details. + +### Conda Quick Start + +This installation method does **not** require Docker. + +- [Install Conda](https://conda.io/projects/conda/en/latest/user-guide/install/index.html) + - Preferably, [install mamba](https://mamba.readthedocs.io/en/latest/mamba-installation.html#mamba-install) instead. + Mamba is much faster than Conda and offers the same features. + +- [Clone this repository](https://docs.github.com/en/repositories/creating-and-managing-repositories/cloning-a-repository) + - `git clone https://github.com/HelikarLab/COMO.git` +- Change directories into the newly cloned repository + - `cd COMO` +- Create a new conda environment + - `conda env create -f environment.yaml`, **OR** + - `mamba env create -f environment.yaml` +- Activate the new environment + - `conda activate como`, **OR** + - `mamba activate como` +- **IMPORTANT**: Install our modified version of zFPKM to allow for filtering insignificant local maxima during RNA-seq + processing + - `R -e "devtools::install_github('babessell1/zFPKM')"` +- Start the notebook server + - `cd main && jupyter notebook` (for the "retro" Jupyter Notebook look and feel), **OR** + - `cd main && jupyter lab` (for the newer Jupyter Lab look and feel) + +This will open a web browser with the Jupyter Notebook/Lab interface. From here, you can open the `COMO.ipynb` notebook +to get started. + +> **NOTE**: This installation method **will** allow for saving your work and utilizing +> the [Gurobi solver](https://www.gurobi.com/).
If you would still like more details about this installation method, +> please [visit our documentation](https://helikarlab.github.io/COMO). + +## Flow Charts + +Please [follow this link](https://helikarlab.github.io/COMO/como_flowcharts.html) for flow charts. ## Resources -* https://opencobra.github.io/ -* cobrapy [doc](https://cobrapy.readthedocs.io/en/stable/), [installation](https://github.com/opencobra/cobrapy/blob/master/INSTALL.rst) -* Limma R package [User Guide](https://www.bioconductor.org/packages/release/bioc/vignettes/limma/inst/doc/usersguide.pdf), [installation](https://bioconductor.org/packages/release/bioc/html/limma.html) -* [affy—analysis of Affymetrix GeneChip data at the probe level](./papers/btg405.pdf) -* [R project](https://www.r-project.org/), [Installation of R, R Studio, R Commander](https://www.andrewheiss.com/blog/2012/04/17/install-r-rstudio-r-commander-windows-osx/) -* [rpy2](https://rpy2.readthedocs.io) -* [biopython](https://biopython.org/wiki/Packages) -* Python: [GEOparse](https://geoparse.readthedocs.io/), R: [GEOquery](https://bioconductor.org/packages/release/bioc/html/GEOquery.html) -* [R Tutorial](https://www.cyclismo.org/tutorial/R/index.html) + +For resources on the packages used in COMO, as well as other useful links, please +see [here](https://helikarlab.github.io/COMO/como_resources.html). + +## Citation +If you use this work, please cite it as follows: + +Brandt Bessell, Josh Loecker, Zhongyuan Zhao, Sara Sadat Aghamiri, Sabyasachi Mohanty, Rada Amin, Tomáš Helikar, Bhanwar Lal Puniya, COMO: a pipeline for multi-omics data integration in metabolic modeling and drug discovery, Briefings in Bioinformatics, Volume 24, Issue 6, November 2023, bbad387, https://doi.org/10.1093/bib/bbad387 diff --git a/code/Affy_Script.r b/code/Affy_Script.r deleted file mode 100755 index 66a30767..00000000 --- a/code/Affy_Script.r +++ /dev/null @@ -1,55 +0,0 @@ -# only need to run block 1-2 -# block 0 -#setwd("~/OneDrive - University of Nebraska-Lincoln/BIOC439_TermProject/New") -entrez570 = read.delim(file = "gpl570entrez.csv", sep = ",") -entrez96 = read.delim(file = "gpl96entrez.csv", sep = ",") -entrez97 = read.delim(file = "gpl97entrez.csv", sep = ",") -entrez4685 = read.delim(file = "gpl4685entrez.csv", sep = ",") -entrez8300 = read.delim(file = "gpl8300entrez.csv", sep = ",") - -# block 1 -#setwd("~/OneDrive - University of Nebraska-Lincoln/BIOC439_TermProject/New/Control/gpl570") -library(affy) -mydata = ReadAffy() -eset = mas5(mydata) -eset_PMA <- mas5calls(mydata) -y <- data.frame(exprs(eset), exprs(eset_PMA), assayDataElement(eset_PMA, "se.exprs")) -y <- y[,sort(names(y))] -y[,25] = rownames(y) -string = colnames(y) -string[25] = "ID" -colnames(y) = string -y_entrez = merge(y, entrez570, by = "ID", all = TRUE) -write.table(y, file="mydata_PMA.xls", quote=F, col.names = NA, sep="\t") -write.table(y_entrez, file="mydata_PMA_entrez.xls", quote=F, sep="\t", col.names = NA) - -x = data.frame(exprs(eset)) - -x[,35] = rownames(x) -string = colnames(x) -string[35] = "ID" -colnames(x) = string -MERGE = merge(x, entrez96, by = "ID", all = TRUE) -write.table(MERGE, file = "MERGE.csv", sep = ",") - -# block 2 -# organize columns and remove expression data with no entrez ID -#NEW = read.delim(file = "MERGE.csv", sep = ",") -NEW = read.delim(file = "MERGE.csv", sep = ",") -entrez = NEW[,1] -NEW = NEW[,-1] - -# block 3 -# calculate z-score -NEW_z = scale(log2(NEW)) -rownames(NEW_z) = entrez -write.table(NEW_z, file = "data_z.csv", sep = ",") - -# boxplot -boxplot_labels =
colnames(NEW_z) -boxplot = boxplot.matrix(NEW_z, use.cols = TRUE, outline = TRUE, names = boxplot_labels, main = "GSE2770 (96) Data", xlab = "Samples", ylab = "Normalized Expression Value", col=(c("purple"))) - -# remove duplicates after sorting highest to lowest average expression -NEW_z = read.delim(file = "data_z.csv", sep = ",") -NEW_z = NEW_z[!duplicated(NEW_z[,1]),] -write.table(NEW_z, file = "data_z_noduplicates.csv", sep = ",", row.names = FALSE) diff --git a/code/Duplicated_cmapFiles_Sript.R b/code/Duplicated_cmapFiles_Sript.R deleted file mode 100755 index fb51db44..00000000 --- a/code/Duplicated_cmapFiles_Sript.R +++ /dev/null @@ -1,9 +0,0 @@ -tx1=NULL; -for (i in 1:190) { - -tx=tmp1[[i]][1] -tx1= append(tx1,tx) -} - - -which(duplicated(tx1)==TRUE) diff --git a/code/GEO_ID_maps.r b/code/GEO_ID_maps.r deleted file mode 100644 index e3ff6b3c..00000000 --- a/code/GEO_ID_maps.r +++ /dev/null @@ -1,43 +0,0 @@ -#!/usr/bin/env Rscript -prjwd <- '~/Dropbox/Helikar/pipelines/data/' -setwd(prjwd) -library(GEOquery) -gpl96 <- getGEO('GPL96', destdir=".") -gpl97 <- getGEO('GPL97', destdir=".") -gpl570 <- getGEO('GPL570', destdir=".") -gpl4685 <- getGEO('GPL4685', destdir=".") -gpl8300 <- getGEO('GPL8300', destdir=".") - -entrez_id_96 = gpl96@dataTable@table$ENTREZ_GENE_ID -entrez_id_97 = gpl97@dataTable@table$ENTREZ_GENE_ID -entrez_id_570 = gpl570@dataTable@table$ENTREZ_GENE_ID -#entrez_id_4685 = gpl4685@dataTable@table$ENTREZ_GENE_ID -entrez_id_8300 = gpl8300@dataTable@table$ENTREZ_GENE_ID -id_96 = gpl96@dataTable@table$ID -id_97 = gpl97@dataTable@table$ID -id_570 = gpl570@dataTable@table$ID -#id_4685 = gpl4685@dataTable@table$ID -id_8300 = gpl8300@dataTable@table$ID - -entre96 = data.frame(id_96,entrez_id_96) -names(entre96) <- c('ID', 'ENTREZ_GENE_ID') -entre97 = data.frame(id_97,entrez_id_97) -names(entre97) <- c('ID', 'ENTREZ_GENE_ID') -entre570 = data.frame(id_570,entrez_id_570) -names(entre570) <- c('ID', 'ENTREZ_GENE_ID') -#entre4685 = data.frame(id_4685,entrez_id_4685) -#names(entre4685) <- c('ID', 'ENTREZ_GENE_ID') -entre8300 = data.frame(id_8300,entrez_id_8300) -names(entre8300) <- c('ID', 'ENTREZ_GENE_ID') - -write.table(entre96, file = "gpl96entrez.csv", row.names = FALSE, quote = FALSE, sep = ",") -write.table(entre97, file = "gpl97entrez.csv", row.names = FALSE, quote = FALSE, sep = ",") -write.table(entre570, file = "gpl570entrez.csv", row.names = FALSE, quote = FALSE, sep = ",") -#write.table(entre4685, file = "gpl4685entrez.csv", row.names = FALSE, quote = FALSE, sep = ",") -write.table(entre8300, file = "gpl8300entrez.csv", row.names = FALSE, quote = FALSE, sep = ",") - -#entrez570 = write.delim(file = "gpl570entrez.csv", sep = ",") -#entrez96 = write.delim(file = "gpl96entrez.csv", sep = ",") -#entrez97 = read.delim(file = "gpl97entrez.csv", sep = ",") -#entrez4685 = read.delim(file = "gpl4685entrez.csv", sep = ",") -#entrez8300 = read.delim(file = "gpl8300entrez.csv", sep = ",") diff --git a/code/Knock_out_simulation_1.m b/code/Knock_out_simulation_1.m deleted file mode 100755 index 4ca76933..00000000 --- a/code/Knock_out_simulation_1.m +++ /dev/null @@ -1,91 +0,0 @@ -%import known dt genes (Entrez) -%DT=tdfread('Targets_for_inhibitorsEntrez.txt') -%DT_genes=num2str(DT.Target_ofInhibitos_ENTREZ) -%DT_genes=cellstr(DT_genes) - -fid = fopen('Th1_inhibitors_Entrez.txt'); % change filename -DT_genes = textscan(fid,'%s','Delimiter','\n'); -DT_genes = DT_genes{1,1}; -DT_model=intersect(model.genes,DT_genes); - -% reactions of drug target genes -%reaction indices and gene 
indices from model.rxnGeneMat -[rxnInd, geneInd] = find(model.rxnGeneMat); -% geneInd to gene -geneInd2genes=model.genes(geneInd); -%gtoKD=intersect(DT_model,geneInd2genes) - -% simulate WT and gene deletion by perturbing DT_model -WT_sol=optimizeCbModel(model) -WT_sol=WT_sol.x -[grRatio,grRateKO,grRateWT,hasEffect,delRxns] = singleGeneDeletion(model,'MOMA',DT_model) -hasEffect_DT=find(hasEffect==1) -hasEffect_DTgenes=DT_model(hasEffect_DT) -[grRatio,grRateKO,grRateWT,hasEffect,delRxns,fluxSolution] = singleGeneDeletion(model,'MOMA',hasEffect_DTgenes) - -% flux ratio matrix - -fluxSolutionRatios=[]; -for i= 1:size(fluxSolution,2) -FSratios=fluxSolution(:,i)./WT_sol; -fluxSolutionRatios(:,i)=FSratios -end - -% Read files of up- and down-regulated genes from DAG_genes.txt -%%%%DAG_dis=tdfread('UP_DOWN_DAG.txt') -% Search gene indices for up down genes in diseases -%%DAG_dis_genes=DAG_dis.Gene -%%%DAG_dis_genes= cellstr(DAG_dis_genes) -%%%DAG_dis_genes=strtrim(DAG_dis_genes) -%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%% - -fid = fopen('RA_DOWN.txt'); % change filename, run this for both up and down regulated genes -DAG_dis_genes = textscan(fid,'%s','Delimiter','\n'); -DAG_dis_genes = DAG_dis_genes{1,1}; - -% intersect with geneInd2gene to obtain DAG genes within the model -DAG_dis_met_genes=intersect(DAG_dis_genes,geneInd2genes) - -% Obtain reactions associated with DAG_dis_met_genes -% first obtain indices from geneIndtogenes -DAG_dis_met_genesInd=find(ismember(geneInd2genes,DAG_dis_met_genes)) -DAG_dis_met_rxnInd = rxnInd(DAG_dis_met_genesInd) - -% obtain DAG reactions flux ratios -DAG_rxn_fluxRatio = fluxSolutionRatios(DAG_dis_met_rxnInd,:) - -% -combined_output={geneInd2genes(DAG_dis_met_genesInd), DAG_dis_met_rxnInd, DAG_rxn_fluxRatio} - -% create network of genes based on the change - -gene_mat_out=[]; -for i = 1:length(hasEffect_DTgenes) -FR_i=DAG_rxn_fluxRatio(:,i) -naN_ind= find(isnan(DAG_rxn_fluxRatio(:,i))) -FR_i([naN_ind])=[]; -Gene_i=combined_output{1} -Gene_i([naN_ind])=[]; -Rind_i=combined_output{2} -Rind_i([naN_ind])=[]; - -P_gene=repelem(hasEffect_DTgenes(i),length(Gene_i)) -P_gene=P_gene' -pegene_mat=[P_gene Gene_i num2cell(FR_i)] -gene_mat_out=[gene_mat_out; pegene_mat] -end -% Change_file_name here -T = cell2table(gene_mat_out); -writetable(T,'Gene_Pairs_Inhi_Fratio_DOWN.txt') - - -% get Flux distribution for Rxns of genes associated with Up-regulated genes- score ((total number of flux down/total numebr of fluxes)-total number of upregulated fluxes/total number of fluxes) -% get the flux distribution for reactions associated with down-regulated genes- score 1 if it upregulates the down-regulated gene (total number of down-regulated fluxes/total fluxes-otal number of down-regulated fluxes/total flux) -% get the growth for each knock-out phenotype - score 1 if it not drastically changes the growth -% consider toxicity - score 1 if it is not changes housekeeping genes -% if it not changes the essential genes - -% sort scores -% pick highest one -% - - diff --git a/code/SCORE.R b/code/SCORE.R deleted file mode 100755 index 6de44dbe..00000000 --- a/code/SCORE.R +++ /dev/null @@ -1,50 +0,0 @@ - -d_score=NULL; - -p_model_genes=unique(data_out2[,1]) - -for (i in 1:length(p_model_genes)) { - -ind1=which(data_out2[,1]==p_model_genes[i]) -dat1=data_out2[ind1,] -total_aff=length(unique(dat1[,2])) -n_aff_down=which(dat1[,3]<0.99) -n_aff_down=length(unique(dat1[n_aff_down,2])) 
-n_aff_up=which(dat1[,3]=="Inf" | as.numeric(dat1[,3])>1) -n_aff_up=length(unique(dat1[n_aff_up,2])) -d_s=((n_aff_down - n_aff_up)/total_aff) - -d_s=paste(p_model_genes[i], d_s) -d_score=append(d_score, d_s) -} - -write.table(d_score, "d_score_UP.txt") - -##################### if above dont run ################## -d_score=NULL; - -p_model_genes=unique(data_out2[,1]) - -for (i in 1:length(p_model_genes)) { - -ind1=which(data_out2[,1]==p_model_genes[i]) -dat1=data_out2[ind1,] - if (class(dat1) == "character"){ - dat1=as.matrix(dat1) - dat1=t(dat1) - } else {} - -total_aff=length(unique(dat1[,2])) -n_aff_down=which(dat1[,3]<0.99) -n_aff_down=length(unique(dat1[n_aff_down,2])) -n_aff_up=which(dat1[,3]=="Inf" | as.numeric(dat1[,3])>1) -n_aff_up=length(unique(dat1[n_aff_up,2])) -d_s=((n_aff_down - n_aff_up)/total_aff) - -d_s=paste(p_model_genes[i], d_s) -d_score=append(d_score, d_s) -} - -write.table(d_score, "d_score_UP.txt") - - diff --git a/code/SCORE_DOWN.R b/code/SCORE_DOWN.R deleted file mode 100755 index 239ff422..00000000 --- a/code/SCORE_DOWN.R +++ /dev/null @@ -1,52 +0,0 @@ -d_score1=NULL; - -p_model_genes=unique(data_out2[,1]) - -for (i in 1:length(p_model_genes)) { - -ind1=which(data_out2[,1]==p_model_genes[i]) -dat1=data_out2[ind1,] -total_aff=length(unique(dat1[,2])) -n_aff_down=which(dat1[,3]<0.99) -n_aff_down=length(unique(dat1[n_aff_down,2])) -n_aff_up=which(dat1[,3]=="Inf" | as.numeric(dat1[,3])>1) -n_aff_up=length(unique(dat1[n_aff_up,2])) -#d_s=((n_aff_up - n_aff_down)/total_aff) -#d_s=(n_aff_up - n_aff_down) -d_s=(n_aff_up/total_aff) -d_s=paste(p_model_genes[i], d_s) -d_score1=append(d_score1, d_s) -} - -write.table(d_score1, "d_scoreDOWN.txt") - - - -############## If above wont run ################## - -d_score1=NULL; - -p_model_genes=unique(data_out2[,1]) - -for (i in 1:length(p_model_genes)) { - -ind1=which(data_out2[,1]==p_model_genes[i]) -dat1=data_out2[ind1,] - - if (class(dat1) == "character"){ - dat1=as.matrix(dat1) - dat1=t(dat1) - } else {} -total_aff=length(unique(dat1[,2])) -n_aff_down=which(dat1[,3]<0.99) -n_aff_down=length(unique(dat1[n_aff_down,2])) -n_aff_up=which(dat1[,3]=="Inf" | as.numeric(dat1[,3])>1) -n_aff_up=length(unique(dat1[n_aff_up,2])) -#d_s=((n_aff_up - n_aff_down)/total_aff) -#d_s=(n_aff_up - n_aff_down) -d_s=(n_aff_up/total_aff) -d_s=paste(p_model_genes[i], d_s) -d_score1=append(d_score1, d_s) -} - -write.table(d_score1, "d_scoreDOWN.txt") diff --git a/code/SCRIPT_ExtractingFRatio_DE_genes_2.R b/code/SCRIPT_ExtractingFRatio_DE_genes_2.R deleted file mode 100755 index a5ea52f0..00000000 --- a/code/SCRIPT_ExtractingFRatio_DE_genes_2.R +++ /dev/null @@ -1,38 +0,0 @@ -###################################################################################################################################################### -###### This script will work on the output of previous (that should be run one time only) ####################################################################################################################################### -# Compare with perturbed profile -# Read matlab generated file into R (output of Knock_out_simulation.m) -# To run below chunks directly use the CMap_similarity_data_Entrez.RData - -MN_gene_pairs=read.table("Gene_Pairs_Inhi_Fratio_DOWN.txt", sep=",",header=T) - -# Extract connectivtyMap_score for MN_gene_Pairs -data_out2={}; -for( i in 1:nrow(MN_gene_pairs)){ -c1_c4=which(MN_gene_pairs[i,1]==data_out[,5]) -c2_c5=which(MN_gene_pairs[i,2]==data_out[,6]) -matched_ind=intersect(c1_c4,c2_c5) - 
-if(length(matched_ind>0)) { -matched_d = data_out[matched_ind,] - if(class(matched_d)=="matrix"){ - matched_kd=which(matched_d[,4]=="kd") - matched_d=matched_d[matched_kd,] - comb_row=cbind(as.matrix(MN_gene_pairs[i,]),t(as.data.frame(matched_d))) - }else { - comb_row=cbind(as.matrix(MN_gene_pairs[i,]),t(as.data.frame(matched_d))) - - } - - -} else { -comb_row=cbind(as.matrix(MN_gene_pairs[i,]),t(as.data.frame(rep("NA",6)))) -} - -data_out2=rbind(data_out2,comb_row) -} - - -write.table(data_out2, "MN_gene_pairs_CMap_ScoresDOWN.txt") - -save.image("MN_gene_pairs_CMap_ScoresDOWN.RData") diff --git a/code/Script_CMap_FCMat_data.R b/code/Script_CMap_FCMat_data.R deleted file mode 100755 index f93a6c4b..00000000 --- a/code/Script_CMap_FCMat_data.R +++ /dev/null @@ -1,30 +0,0 @@ -############################ This script is to process similarity scores for multiple genes downloaded from CMap database ################################################### -# read connectivityMap touchstone profiles and convert them into single data matrix -fnames=dir() -data_comb={}; -for (i in 1:length(fnames)){ - -pert_gene_name=strsplit(fnames[i],'_') -pert_gene_name=pert_gene_name[[1]][1] - -data=read.delim(fnames[i],header=T) - -pert_gene_col=rep(pert_gene_name,length(data$Name)) -data_mat=cbind(pert_gene_col,as.character(data$Name), data$Score, as.character(data$Type)) - -data_comb=rbind(data_comb, data_mat) -} - -# Change gene names to Entrez_IDs -library('org.Hs.eg.db') - -Entrez_col1=mapIds(org.Hs.eg.db, data_comb[,1], 'ENTREZID', 'SYMBOL') -Entrez_col2=mapIds(org.Hs.eg.db, data_comb[,2], 'ENTREZID', 'SYMBOL') - - - -data_out=cbind(data_comb,Entrez_col1,Entrez_col2) -#save(data_comb, data_out, file="CMap_similarity_data_Entrez.RData") - - - diff --git a/code/Script_DT.R b/code/Script_DT.R deleted file mode 100755 index 817bdcc4..00000000 --- a/code/Script_DT.R +++ /dev/null @@ -1,54 +0,0 @@ -# Read connectivity map file that has withdrawn drugs removed -con_map_dt=read.csv("Conmap_DT.csv",header=T) -# Read Symbol vs Entrez IDs file retreived form orthoretreiver -Sym2Entrez=read.table("Symbol_Entrez_Conmap_data.txt",header=T) -# Split Drug target symbols - -Target_char=as.character(con_map_dt$Target) -spl_dt=list(); -for (i in 1:length(Target_char)) # -{ -x=strsplit(Target_char[i],split=", ") -spl_dt=append(spl_dt,x) -} - -#### Drugs for each target SYMBOL -SymChar=as.character(Sym2Entrez$Symbol) - -DT_Symb=NULL; -for(i in 1:length(spl_dt)) #length(spl_dt) -{ - if(length(spl_dt[[i]])>0) - { - for(j in 1:length(spl_dt[[i]])) - { - x=which((spl_dt[[i]][j]==SymChar)) - DT_Symb1=cbind(SymChar[x], con_map_dt[i,1:4]) - DT_Symb=rbind(DT_Symb,DT_Symb1) - } - } -} - -#### Addind entrez for each row of symbol -DT_Entrez=NULL; -#DT_symbChar=as.character(DT_Symb[,1]) -for(i in 1:nrow(DT_Symb)) # -{ - x=which(as.character(DT_Symb[i,1])== as.character(Sym2Entrez[,1])) - DT_Entrez1= Sym2Entrez[x,2] - DT_Entrez = append(DT_Entrez,DT_Entrez1) -} - -#Combine DT_Symb (Drug info per symbol) with DT_Entrez (Entrez for each Symbol DT) - -DT_Symb_Entrez=cbind(DT_Entrez,DT_Symb) -write.table(DT_Symb_Entrez, "DRUGS_DATA_ENTREZWISE.txt", sep="\t") - -##end of the script - - - - - - - diff --git a/code/Script_EntrezWise_DrugsConMap.R b/code/Script_EntrezWise_DrugsConMap.R deleted file mode 100755 index 817bdcc4..00000000 --- a/code/Script_EntrezWise_DrugsConMap.R +++ /dev/null @@ -1,54 +0,0 @@ -# Read connectivity map file that has withdrawn drugs removed -con_map_dt=read.csv("Conmap_DT.csv",header=T) -# Read Symbol vs Entrez IDs file 
retreived form orthoretreiver -Sym2Entrez=read.table("Symbol_Entrez_Conmap_data.txt",header=T) -# Split Drug target symbols - -Target_char=as.character(con_map_dt$Target) -spl_dt=list(); -for (i in 1:length(Target_char)) # -{ -x=strsplit(Target_char[i],split=", ") -spl_dt=append(spl_dt,x) -} - -#### Drugs for each target SYMBOL -SymChar=as.character(Sym2Entrez$Symbol) - -DT_Symb=NULL; -for(i in 1:length(spl_dt)) #length(spl_dt) -{ - if(length(spl_dt[[i]])>0) - { - for(j in 1:length(spl_dt[[i]])) - { - x=which((spl_dt[[i]][j]==SymChar)) - DT_Symb1=cbind(SymChar[x], con_map_dt[i,1:4]) - DT_Symb=rbind(DT_Symb,DT_Symb1) - } - } -} - -#### Addind entrez for each row of symbol -DT_Entrez=NULL; -#DT_symbChar=as.character(DT_Symb[,1]) -for(i in 1:nrow(DT_Symb)) # -{ - x=which(as.character(DT_Symb[i,1])== as.character(Sym2Entrez[,1])) - DT_Entrez1= Sym2Entrez[x,2] - DT_Entrez = append(DT_Entrez,DT_Entrez1) -} - -#Combine DT_Symb (Drug info per symbol) with DT_Entrez (Entrez for each Symbol DT) - -DT_Symb_Entrez=cbind(DT_Entrez,DT_Symb) -write.table(DT_Symb_Entrez, "DRUGS_DATA_ENTREZWISE.txt", sep="\t") - -##end of the script - - - - - - - diff --git a/code/Script_Replicating_multiEntrezIDs.r b/code/Script_Replicating_multiEntrezIDs.r deleted file mode 100755 index 1032fd2f..00000000 --- a/code/Script_Replicating_multiEntrezIDs.r +++ /dev/null @@ -1,52 +0,0 @@ -########################################################################## - -###Read data from the text file (Change file path here). Use "read.csv" if data is in CSV format## - -data=read.table("C:/Users/BHANWAR/Desktop/TEST12.txt",header=TRUE) - -###Split Entrz IDs based on the the underscore (as given in files uploaded by Bailee) ## - -nsplits=strsplit(as.character(data[,1]),"_") - -### Replicate the multiEntrez rows and Add new names in the first column## - -data3=NULL; -for(i in 1:nrow(data)){ -if(length(nsplits[[i]])>1){ -data2=data[rep(rownames(data)[i], length(nsplits[[i]])), ] -#print(data2) -data2[,1]=as.character(data2[,1]) -for(j in 1:nrow(data2)){ -data2[j,1]=nsplits[[i]][j] -} -data3=rbind(data3,data2) -} -} - - -## Remove rows with multiple Entrez IDs from the original data frame (create reduces dataframe)## - -tt1=NULL; -for(i in 1:nrow(data)) -{ - -tt=length(nsplits[[i]]) -tt1=append(tt1,tt) - -} - -tt2=which(tt1>1) - - -data4=data[-tt2,] - -## Create final dataframe (Combine replicated dataframe with reduced dataframe)## - -data5= rbind(data4,data3) - - -write.table(data5, "data_gse_replicated_entrez.txt", sep="\t") - - - - diff --git a/code/Script_merging_data.txt b/code/Script_merging_data.txt deleted file mode 100755 index cf9cd21a..00000000 --- a/code/Script_merging_data.txt +++ /dev/null @@ -1,92 +0,0 @@ - - -############################################################# -# Remove duplicates before merging and after merging ########### -###################################################################### - - -test11=read.csv(dir()[6],header=T) - -sum1=rowSums(test11[,-1]) - -test13=cbind(test11,sum1) -test14=test13[rev(order(test13$sum1)),] - - -#test15=unique(test14, by = "ENTREZ_GENE_ID") - -test16=test14[!duplicated(test14$ENTREZ_GENE_ID),] - -##################################################################### -# Use other script for replicating Entrez IDs - -data= test16 - -nsplits=strsplit(as.character(data[,1])," /// ") - -### Replicate the multiEntrez rows and Add new names in the first column## - -data3=NULL; -for(i in 1:nrow(data)){ -if(length(nsplits[[i]])>1){ -data2=data[rep(rownames(data)[i], 
length(nsplits[[i]])), ] -#print(data2) -data2[,1]=as.character(data2[,1]) -for(j in 1:nrow(data2)){ -data2[j,1]=nsplits[[i]][j] -} -data3=rbind(data3,data2) -} -} - - -## Remove rows with multiple Entrez IDs from the original data frame (create reduces dataframe)## - -tt1=NULL; -for(i in 1:nrow(data)) -{ - -tt=length(nsplits[[i]]) -tt1=append(tt1,tt) - -} - -tt2=which(tt1>1) - - -data4=data[-tt2,] - -## Create final dataframe (Combine replicated dataframe with reduced dataframe)## - -data5= rbind(data4,data3) - - -##################Repeat again after duplicating Entrez IDs######## - -test11=data5 - -sum1=rowSums(test11[,-1]) - -test13=cbind(test11,sum1) -test14=test13[rev(order(test13$sum1)),] - - -#test15=unique(test14, by = "ENTREZ_GENE_ID") - -test16=test14[!duplicated(test14$ENTREZ_GENE_ID),] - - -###################################################################### -# Use merge function ################################################# - -merged1=merge(Ex_gse22886_96, Ex_22886_97, by="ENTREZ_GENE_ID", all=TRUE) -merged2=merge(merged1, Ex_2770_8300, by="ENTREZ_GENE_ID", all=TRUE) -merged3=merge(merged2, Ex_2770_96, by="ENTREZ_GENE_ID", all=TRUE) -merged4=merge(merged3, Ex_2770_97, by="ENTREZ_GENE_ID", all=TRUE) -merged5=merge(merged4, Ex_436769_570, by="ENTREZ_GENE_ID", all=TRUE) - - - -write.table(merged5, "Th2_Merged_Affy.txt", sep="\t") - - diff --git a/code/Validation_withCMap_STAT.R b/code/Validation_withCMap_STAT.R deleted file mode 100755 index 848eef50..00000000 --- a/code/Validation_withCMap_STAT.R +++ /dev/null @@ -1,30 +0,0 @@ -v_stat=NULL; - -p_model_genes=unique(data_out2[,1]) - -for (i in 1:length(p_model_genes)) { - -ind1=which(data_out2[,1]==p_model_genes[i]) -dat1=data_out2[ind1,] -total_aff=length(unique(dat1[,2])) - -n_aff_down=which(dat1[,3]<0.99) -n_aff_down_val=which(as.numeric(dat1[,6])>50) -n_aff_down_val=intersect(n_aff_down,n_aff_down_val) - -n_aff_down=length(unique(dat1[n_aff_down,2])) -n_aff_down_val=length(unique(dat1[n_aff_down_val,2])) - -n_aff_up=which(dat1[,3]=="Inf" | as.numeric(dat1[,3])>1) -n_aff_up_val=which(as.numeric(dat1[,6])< -50) -n_aff_up_val=intersect(n_aff_down,n_aff_down_val) - -n_aff_up=length(unique(dat1[n_aff_up,2])) -n_aff_up_val=length(unique(dat1[n_aff_up_val,2])) - -#d_s=((n_aff_down - n_aff_up)/total_aff) -v_s=paste(p_model_genes[i], total_aff, n_aff_down, n_aff_down_val, n_aff_up, n_aff_up_val, sep=" ") -v_stat=append(v_stat, v_s) -} - -write.table(v_stat, "V_STAT1_Sim50.txt") diff --git a/code/merge_R.R b/code/merge_R.R deleted file mode 100755 index 7ff2452d..00000000 --- a/code/merge_R.R +++ /dev/null @@ -1,12 +0,0 @@ -## Merge two data files together. -# Create text file for datasets with HGNC as first column followed by 1's and 0's. -> path1 = "~/Documents/datafile1.txt" -> data1=read.table(path1,header=T) -> path2 = "~/Documents/datafile2.txt" -> data2=read.table(path2,header=T) -> mydata1=data1 -> mydata2=data2 -> mydataset=merge(mydata1, mydata2, by="HGNC",all=TRUE) -> write.table(mydataset,"~/Desktop/merged_file.txt",F) - -# Repeat. Use the "merged_file.txt" as path1 for subsequent merges, dataset to be incorporated as path2. 
\ No newline at end of file diff --git a/data/GSE2770_RAW/GSM60349.CEL.gz b/data/GSE2770_RAW/GSM60349.CEL.gz deleted file mode 100644 index 21cc4eb0..00000000 Binary files a/data/GSE2770_RAW/GSM60349.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60350.CEL.gz b/data/GSE2770_RAW/GSM60350.CEL.gz deleted file mode 100644 index 72262ec1..00000000 Binary files a/data/GSE2770_RAW/GSM60350.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60351.cel.gz b/data/GSE2770_RAW/GSM60351.cel.gz deleted file mode 100644 index f08b8b39..00000000 Binary files a/data/GSE2770_RAW/GSM60351.cel.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60352.cel.gz b/data/GSE2770_RAW/GSM60352.cel.gz deleted file mode 100644 index de99ebde..00000000 Binary files a/data/GSE2770_RAW/GSM60352.cel.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60353.cel.gz b/data/GSE2770_RAW/GSM60353.cel.gz deleted file mode 100644 index 29323e1c..00000000 Binary files a/data/GSE2770_RAW/GSM60353.cel.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60354.CEL.gz b/data/GSE2770_RAW/GSM60354.CEL.gz deleted file mode 100644 index 44306fb5..00000000 Binary files a/data/GSE2770_RAW/GSM60354.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60355.CEL.gz b/data/GSE2770_RAW/GSM60355.CEL.gz deleted file mode 100644 index 340653e8..00000000 Binary files a/data/GSE2770_RAW/GSM60355.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60356.CEL.gz b/data/GSE2770_RAW/GSM60356.CEL.gz deleted file mode 100644 index cac9e47e..00000000 Binary files a/data/GSE2770_RAW/GSM60356.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60357.CEL.gz b/data/GSE2770_RAW/GSM60357.CEL.gz deleted file mode 100644 index 9107f420..00000000 Binary files a/data/GSE2770_RAW/GSM60357.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60358.CEL.gz b/data/GSE2770_RAW/GSM60358.CEL.gz deleted file mode 100644 index 036973ae..00000000 Binary files a/data/GSE2770_RAW/GSM60358.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60359.CEL.gz b/data/GSE2770_RAW/GSM60359.CEL.gz deleted file mode 100644 index f664975a..00000000 Binary files a/data/GSE2770_RAW/GSM60359.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60360.CEL.gz b/data/GSE2770_RAW/GSM60360.CEL.gz deleted file mode 100644 index c86985a0..00000000 Binary files a/data/GSE2770_RAW/GSM60360.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60361.CEL.gz b/data/GSE2770_RAW/GSM60361.CEL.gz deleted file mode 100644 index f5413d26..00000000 Binary files a/data/GSE2770_RAW/GSM60361.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60362.CEL.gz b/data/GSE2770_RAW/GSM60362.CEL.gz deleted file mode 100644 index 55d87dd9..00000000 Binary files a/data/GSE2770_RAW/GSM60362.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60363.CEL.gz b/data/GSE2770_RAW/GSM60363.CEL.gz deleted file mode 100644 index fab283a0..00000000 Binary files a/data/GSE2770_RAW/GSM60363.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60364.CEL.gz b/data/GSE2770_RAW/GSM60364.CEL.gz deleted file mode 100644 index 29a81566..00000000 Binary files a/data/GSE2770_RAW/GSM60364.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60365.CEL.gz b/data/GSE2770_RAW/GSM60365.CEL.gz deleted file mode 100644 index bfac92b9..00000000 Binary files a/data/GSE2770_RAW/GSM60365.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60366.CEL.gz b/data/GSE2770_RAW/GSM60366.CEL.gz deleted file mode 100644 index 
fe673123..00000000 Binary files a/data/GSE2770_RAW/GSM60366.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60367.CEL.gz b/data/GSE2770_RAW/GSM60367.CEL.gz deleted file mode 100644 index 62a05d4d..00000000 Binary files a/data/GSE2770_RAW/GSM60367.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60368.CEL.gz b/data/GSE2770_RAW/GSM60368.CEL.gz deleted file mode 100644 index e163b247..00000000 Binary files a/data/GSE2770_RAW/GSM60368.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60369.CEL.gz b/data/GSE2770_RAW/GSM60369.CEL.gz deleted file mode 100644 index ee403c35..00000000 Binary files a/data/GSE2770_RAW/GSM60369.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60370.CEL.gz b/data/GSE2770_RAW/GSM60370.CEL.gz deleted file mode 100644 index 61dcb9c4..00000000 Binary files a/data/GSE2770_RAW/GSM60370.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60371.CEL.gz b/data/GSE2770_RAW/GSM60371.CEL.gz deleted file mode 100644 index 25d95f52..00000000 Binary files a/data/GSE2770_RAW/GSM60371.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60372.CEL.gz b/data/GSE2770_RAW/GSM60372.CEL.gz deleted file mode 100644 index d3fdd713..00000000 Binary files a/data/GSE2770_RAW/GSM60372.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60373.CEL.gz b/data/GSE2770_RAW/GSM60373.CEL.gz deleted file mode 100644 index 56e99387..00000000 Binary files a/data/GSE2770_RAW/GSM60373.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60374.CEL.gz b/data/GSE2770_RAW/GSM60374.CEL.gz deleted file mode 100644 index 77d2afd1..00000000 Binary files a/data/GSE2770_RAW/GSM60374.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60375.CEL.gz b/data/GSE2770_RAW/GSM60375.CEL.gz deleted file mode 100644 index 8a204702..00000000 Binary files a/data/GSE2770_RAW/GSM60375.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60376.CEL.gz b/data/GSE2770_RAW/GSM60376.CEL.gz deleted file mode 100644 index afb7e1fd..00000000 Binary files a/data/GSE2770_RAW/GSM60376.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60377.CEL.gz b/data/GSE2770_RAW/GSM60377.CEL.gz deleted file mode 100644 index 3bb50875..00000000 Binary files a/data/GSE2770_RAW/GSM60377.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60378.CEL.gz b/data/GSE2770_RAW/GSM60378.CEL.gz deleted file mode 100644 index 3f7ee227..00000000 Binary files a/data/GSE2770_RAW/GSM60378.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60379.CEL.gz b/data/GSE2770_RAW/GSM60379.CEL.gz deleted file mode 100644 index be7a9731..00000000 Binary files a/data/GSE2770_RAW/GSM60379.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60380.CEL.gz b/data/GSE2770_RAW/GSM60380.CEL.gz deleted file mode 100644 index ceac87e1..00000000 Binary files a/data/GSE2770_RAW/GSM60380.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60381.CEL.gz b/data/GSE2770_RAW/GSM60381.CEL.gz deleted file mode 100644 index ad8ecba8..00000000 Binary files a/data/GSE2770_RAW/GSM60381.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60699.CEL.gz b/data/GSE2770_RAW/GSM60699.CEL.gz deleted file mode 100644 index d075c0f8..00000000 Binary files a/data/GSE2770_RAW/GSM60699.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60700.CEL.gz b/data/GSE2770_RAW/GSM60700.CEL.gz deleted file mode 100644 index 613e9b79..00000000 Binary files a/data/GSE2770_RAW/GSM60700.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60701.CEL.gz 
b/data/GSE2770_RAW/GSM60701.CEL.gz deleted file mode 100644 index 0e61d394..00000000 Binary files a/data/GSE2770_RAW/GSM60701.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60702.CEL.gz b/data/GSE2770_RAW/GSM60702.CEL.gz deleted file mode 100644 index 6e8c32c6..00000000 Binary files a/data/GSE2770_RAW/GSM60702.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60703.CEL.gz b/data/GSE2770_RAW/GSM60703.CEL.gz deleted file mode 100644 index 9d56706c..00000000 Binary files a/data/GSE2770_RAW/GSM60703.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60704.CEL.gz b/data/GSE2770_RAW/GSM60704.CEL.gz deleted file mode 100644 index e467d53c..00000000 Binary files a/data/GSE2770_RAW/GSM60704.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60705.CEL.gz b/data/GSE2770_RAW/GSM60705.CEL.gz deleted file mode 100644 index 3be192bd..00000000 Binary files a/data/GSE2770_RAW/GSM60705.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60706.CEL.gz b/data/GSE2770_RAW/GSM60706.CEL.gz deleted file mode 100644 index 7e3d68b2..00000000 Binary files a/data/GSE2770_RAW/GSM60706.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60707.CEL.gz b/data/GSE2770_RAW/GSM60707.CEL.gz deleted file mode 100644 index bbf808f1..00000000 Binary files a/data/GSE2770_RAW/GSM60707.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60708.CEL.gz b/data/GSE2770_RAW/GSM60708.CEL.gz deleted file mode 100644 index 21949947..00000000 Binary files a/data/GSE2770_RAW/GSM60708.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60709.CEL.gz b/data/GSE2770_RAW/GSM60709.CEL.gz deleted file mode 100644 index af6d9082..00000000 Binary files a/data/GSE2770_RAW/GSM60709.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60710.CEL.gz b/data/GSE2770_RAW/GSM60710.CEL.gz deleted file mode 100644 index e5474b89..00000000 Binary files a/data/GSE2770_RAW/GSM60710.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60711.CEL.gz b/data/GSE2770_RAW/GSM60711.CEL.gz deleted file mode 100644 index db4624dd..00000000 Binary files a/data/GSE2770_RAW/GSM60711.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60712.CEL.gz b/data/GSE2770_RAW/GSM60712.CEL.gz deleted file mode 100644 index d28f0e51..00000000 Binary files a/data/GSE2770_RAW/GSM60712.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60713.CEL.gz b/data/GSE2770_RAW/GSM60713.CEL.gz deleted file mode 100644 index 05bb3778..00000000 Binary files a/data/GSE2770_RAW/GSM60713.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60714.CEL.gz b/data/GSE2770_RAW/GSM60714.CEL.gz deleted file mode 100644 index 6539d8d1..00000000 Binary files a/data/GSE2770_RAW/GSM60714.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60715.CEL.gz b/data/GSE2770_RAW/GSM60715.CEL.gz deleted file mode 100644 index 84f3f7bc..00000000 Binary files a/data/GSE2770_RAW/GSM60715.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60716.CEL.gz b/data/GSE2770_RAW/GSM60716.CEL.gz deleted file mode 100644 index 7fbed5c5..00000000 Binary files a/data/GSE2770_RAW/GSM60716.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60717.CEL.gz b/data/GSE2770_RAW/GSM60717.CEL.gz deleted file mode 100644 index 96131f24..00000000 Binary files a/data/GSE2770_RAW/GSM60717.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60718.CEL.gz b/data/GSE2770_RAW/GSM60718.CEL.gz deleted file mode 100644 index 767c7abc..00000000 Binary files a/data/GSE2770_RAW/GSM60718.CEL.gz and /dev/null differ 
diff --git a/data/GSE2770_RAW/GSM60719.CEL.gz b/data/GSE2770_RAW/GSM60719.CEL.gz deleted file mode 100644 index 60968d11..00000000 Binary files a/data/GSE2770_RAW/GSM60719.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60720.CEL.gz b/data/GSE2770_RAW/GSM60720.CEL.gz deleted file mode 100644 index 62639350..00000000 Binary files a/data/GSE2770_RAW/GSM60720.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60721.CEL.gz b/data/GSE2770_RAW/GSM60721.CEL.gz deleted file mode 100644 index fd9ac448..00000000 Binary files a/data/GSE2770_RAW/GSM60721.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60722.CEL.gz b/data/GSE2770_RAW/GSM60722.CEL.gz deleted file mode 100644 index 488d5e0e..00000000 Binary files a/data/GSE2770_RAW/GSM60722.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60723.CEL.gz b/data/GSE2770_RAW/GSM60723.CEL.gz deleted file mode 100644 index ba73a96b..00000000 Binary files a/data/GSE2770_RAW/GSM60723.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60724.CEL.gz b/data/GSE2770_RAW/GSM60724.CEL.gz deleted file mode 100644 index eb3cbec7..00000000 Binary files a/data/GSE2770_RAW/GSM60724.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60725.CEL.gz b/data/GSE2770_RAW/GSM60725.CEL.gz deleted file mode 100644 index a0a23172..00000000 Binary files a/data/GSE2770_RAW/GSM60725.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60726.CEL.gz b/data/GSE2770_RAW/GSM60726.CEL.gz deleted file mode 100644 index 145e6c9d..00000000 Binary files a/data/GSE2770_RAW/GSM60726.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60727.CEL.gz b/data/GSE2770_RAW/GSM60727.CEL.gz deleted file mode 100644 index d67f8a2f..00000000 Binary files a/data/GSE2770_RAW/GSM60727.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60728.CEL.gz b/data/GSE2770_RAW/GSM60728.CEL.gz deleted file mode 100644 index a05e0986..00000000 Binary files a/data/GSE2770_RAW/GSM60728.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60729.CEL.gz b/data/GSE2770_RAW/GSM60729.CEL.gz deleted file mode 100644 index 2fd09627..00000000 Binary files a/data/GSE2770_RAW/GSM60729.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60730.CEL.gz b/data/GSE2770_RAW/GSM60730.CEL.gz deleted file mode 100644 index 76bcd635..00000000 Binary files a/data/GSE2770_RAW/GSM60730.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60731.CEL.gz b/data/GSE2770_RAW/GSM60731.CEL.gz deleted file mode 100644 index af103b3c..00000000 Binary files a/data/GSE2770_RAW/GSM60731.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60732.CEL.gz b/data/GSE2770_RAW/GSM60732.CEL.gz deleted file mode 100644 index 306af3ed..00000000 Binary files a/data/GSE2770_RAW/GSM60732.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60733.CEL.gz b/data/GSE2770_RAW/GSM60733.CEL.gz deleted file mode 100644 index 55b17374..00000000 Binary files a/data/GSE2770_RAW/GSM60733.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60734.CEL.gz b/data/GSE2770_RAW/GSM60734.CEL.gz deleted file mode 100644 index fcc68deb..00000000 Binary files a/data/GSE2770_RAW/GSM60734.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60735.CEL.gz b/data/GSE2770_RAW/GSM60735.CEL.gz deleted file mode 100644 index 472f55ac..00000000 Binary files a/data/GSE2770_RAW/GSM60735.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60736.CEL.gz b/data/GSE2770_RAW/GSM60736.CEL.gz deleted file mode 100644 index 099f8649..00000000 Binary files 
a/data/GSE2770_RAW/GSM60736.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60737.CEL.gz b/data/GSE2770_RAW/GSM60737.CEL.gz deleted file mode 100644 index d3568a8a..00000000 Binary files a/data/GSE2770_RAW/GSM60737.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60738.CEL.gz b/data/GSE2770_RAW/GSM60738.CEL.gz deleted file mode 100644 index e40b7727..00000000 Binary files a/data/GSE2770_RAW/GSM60738.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60739.CEL.gz b/data/GSE2770_RAW/GSM60739.CEL.gz deleted file mode 100644 index ded90f9b..00000000 Binary files a/data/GSE2770_RAW/GSM60739.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60740.CEL.gz b/data/GSE2770_RAW/GSM60740.CEL.gz deleted file mode 100644 index e96affb8..00000000 Binary files a/data/GSE2770_RAW/GSM60740.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60741.CEL.gz b/data/GSE2770_RAW/GSM60741.CEL.gz deleted file mode 100644 index 5431fbed..00000000 Binary files a/data/GSE2770_RAW/GSM60741.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60742.CEL.gz b/data/GSE2770_RAW/GSM60742.CEL.gz deleted file mode 100644 index f56827fe..00000000 Binary files a/data/GSE2770_RAW/GSM60742.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60743.CEL.gz b/data/GSE2770_RAW/GSM60743.CEL.gz deleted file mode 100644 index e8a25797..00000000 Binary files a/data/GSE2770_RAW/GSM60743.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60744.CEL.gz b/data/GSE2770_RAW/GSM60744.CEL.gz deleted file mode 100644 index fcf6147f..00000000 Binary files a/data/GSE2770_RAW/GSM60744.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60745.CEL.gz b/data/GSE2770_RAW/GSM60745.CEL.gz deleted file mode 100644 index 579fb0e6..00000000 Binary files a/data/GSE2770_RAW/GSM60745.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60746.CEL.gz b/data/GSE2770_RAW/GSM60746.CEL.gz deleted file mode 100644 index 6249870a..00000000 Binary files a/data/GSE2770_RAW/GSM60746.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60747.CEL.gz b/data/GSE2770_RAW/GSM60747.CEL.gz deleted file mode 100644 index 0c84fc03..00000000 Binary files a/data/GSE2770_RAW/GSM60747.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60748.CEL.gz b/data/GSE2770_RAW/GSM60748.CEL.gz deleted file mode 100644 index be429915..00000000 Binary files a/data/GSE2770_RAW/GSM60748.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60749.CEL.gz b/data/GSE2770_RAW/GSM60749.CEL.gz deleted file mode 100644 index bda08e77..00000000 Binary files a/data/GSE2770_RAW/GSM60749.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60750.CEL.gz b/data/GSE2770_RAW/GSM60750.CEL.gz deleted file mode 100644 index 3a12b287..00000000 Binary files a/data/GSE2770_RAW/GSM60750.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60751.CEL.gz b/data/GSE2770_RAW/GSM60751.CEL.gz deleted file mode 100644 index 595141d9..00000000 Binary files a/data/GSE2770_RAW/GSM60751.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60752.CEL.gz b/data/GSE2770_RAW/GSM60752.CEL.gz deleted file mode 100644 index 2b41b6db..00000000 Binary files a/data/GSE2770_RAW/GSM60752.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60753.CEL.gz b/data/GSE2770_RAW/GSM60753.CEL.gz deleted file mode 100644 index 253252bd..00000000 Binary files a/data/GSE2770_RAW/GSM60753.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60754.CEL.gz b/data/GSE2770_RAW/GSM60754.CEL.gz deleted file mode 
100644 index a3ee71f1..00000000 Binary files a/data/GSE2770_RAW/GSM60754.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60755.CEL.gz b/data/GSE2770_RAW/GSM60755.CEL.gz deleted file mode 100644 index ea749806..00000000 Binary files a/data/GSE2770_RAW/GSM60755.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60756.CEL.gz b/data/GSE2770_RAW/GSM60756.CEL.gz deleted file mode 100644 index d5911626..00000000 Binary files a/data/GSE2770_RAW/GSM60756.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60757.CEL.gz b/data/GSE2770_RAW/GSM60757.CEL.gz deleted file mode 100644 index e12fd39b..00000000 Binary files a/data/GSE2770_RAW/GSM60757.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60758.CEL.gz b/data/GSE2770_RAW/GSM60758.CEL.gz deleted file mode 100644 index 29ef0935..00000000 Binary files a/data/GSE2770_RAW/GSM60758.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60759.CEL.gz b/data/GSE2770_RAW/GSM60759.CEL.gz deleted file mode 100644 index 178ec16e..00000000 Binary files a/data/GSE2770_RAW/GSM60759.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60760.CEL.gz b/data/GSE2770_RAW/GSM60760.CEL.gz deleted file mode 100644 index 4cbc8710..00000000 Binary files a/data/GSE2770_RAW/GSM60760.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60761.CEL.gz b/data/GSE2770_RAW/GSM60761.CEL.gz deleted file mode 100644 index 5c65a81a..00000000 Binary files a/data/GSE2770_RAW/GSM60761.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60762.CEL.gz b/data/GSE2770_RAW/GSM60762.CEL.gz deleted file mode 100644 index 7630ad61..00000000 Binary files a/data/GSE2770_RAW/GSM60762.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60763.CEL.gz b/data/GSE2770_RAW/GSM60763.CEL.gz deleted file mode 100644 index 751e13e4..00000000 Binary files a/data/GSE2770_RAW/GSM60763.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60764.CEL.gz b/data/GSE2770_RAW/GSM60764.CEL.gz deleted file mode 100644 index 132d1605..00000000 Binary files a/data/GSE2770_RAW/GSM60764.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60765.CEL.gz b/data/GSE2770_RAW/GSM60765.CEL.gz deleted file mode 100644 index 946bd53e..00000000 Binary files a/data/GSE2770_RAW/GSM60765.CEL.gz and /dev/null differ diff --git a/data/GSE2770_RAW/GSM60766.CEL.gz b/data/GSE2770_RAW/GSM60766.CEL.gz deleted file mode 100644 index c52287d7..00000000 Binary files a/data/GSE2770_RAW/GSM60766.CEL.gz and /dev/null differ diff --git a/doc/IMG_1.jpg b/doc/IMG_1.jpg deleted file mode 100755 index 1502fbe1..00000000 Binary files a/doc/IMG_1.jpg and /dev/null differ diff --git a/doc/IMG_2.jpg b/doc/IMG_2.jpg deleted file mode 100755 index b50d74a4..00000000 Binary files a/doc/IMG_2.jpg and /dev/null differ diff --git a/doc/IMG_3.jpg b/doc/IMG_3.jpg deleted file mode 100755 index 76d64532..00000000 Binary files a/doc/IMG_3.jpg and /dev/null differ diff --git a/doc/IMG_4.jpg b/doc/IMG_4.jpg deleted file mode 100755 index f5ed0908..00000000 Binary files a/doc/IMG_4.jpg and /dev/null differ diff --git a/docker-compose.yml b/docker-compose.yml new file mode 100644 index 00000000..310b864f --- /dev/null +++ b/docker-compose.yml @@ -0,0 +1,9 @@ +services: + como: + image: ghcr.io/helikarlab/como:latest + container_name: como + ports: + - "8888:8888" + volumes: + # Map "[LOCAL_DIRECTORY]/local_files to /app/main/data/local_files + - "$(pwd)/local_files:/app/main/data/local_files" \ No newline at end of file diff --git a/environment.yaml b/environment.yaml new file 
mode 100644 index 00000000..ff53db13 --- /dev/null +++ b/environment.yaml @@ -0,0 +1,69 @@ +name: como +channels: + - conda-forge + - bioconda + - gurobi + +# Use the ~= specifier to allow any minor updates +# For example, "~=4.0" allows any 4.x version, but does not allow version 5.0 +dependencies: + - bioconda::bioconductor-affyio~=1.64.0 + - bioconda::bioconductor-affy~=1.72.0 + - bioconda::bioconductor-agilp~=3.26.0 + - bioconda::bioconductor-biomart~=2.50.0 + - bioconda::bioconductor-deseq2~=1.34.0 + - bioconda::bioconductor-edger~=3.36.0 + - bioconda::bioconductor-genefilter~=1.76.0 + # - bioconda::bioconductor-genomeinfodbdata~=1.2.11 # Required or else hgu133acdf fails to install + - bioconda::bioconductor-hgu133acdf~=2.18.0 + # - bioconda::bioconductor-limma~=3.50.1 + - bioconda::crux-toolkit~=4.1 + - bioconda::thermorawfileparser~=1.4.0 + - conda-forge::aioftp~=0.21.2 + # - conda-forge::bioservices~=1.11.2 + - conda-forge::cobra~=0.29.0 + - conda-forge::geoparse~=2.0.3 + - conda-forge::git~=2.37.0 # Required for pip-related dependencies + - conda-forge::jupyterlab~=4.0.0 + - conda-forge::lxml~=4.9.1 + - conda-forge::numpy~=1.23.0 + - conda-forge::openpyxl~=3.0.10 + # - conda-forge::optlang~=1.5.2 + - conda-forge::pandas<=3.0.0 + - conda-forge::pip + - conda-forge::python-libsbml~=5.19.2 + - conda-forge::python~=3.10 + - conda-forge::r-base + - conda-forge::r-biocmanager~=1.30.18 + - conda-forge::r-devtools~=2.4.3 + - conda-forge::r-factominer~=2.8 + - conda-forge::r-ggrepel~=0.9.1 + - conda-forge::r-irdisplay~=1.1 + - conda-forge::r-irkernel~=1.3 + - conda-forge::r-locfit~=1.5_9.5 + - conda-forge::r-readxl~=1.4.0 + - conda-forge::r-repr~=1.1.4 + - conda-forge::r-rzmq~=0.9.8 + - conda-forge::r-sjmisc~=2.8.9 + - conda-forge::r-stringr~=1.4.0 + - conda-forge::r-tidyverse~=1.3.1 + - conda-forge::r-uwot~=0.1.11 + - conda-forge::r-zoo~=1.8_10 + # - conda-forge::requests~=2.28.1 + - conda-forge::rpy2~=3.5.1 + # - conda-forge::scipy~=1.8.1 + - conda-forge::sqlalchemy~=1.4.39 + - conda-forge::tqdm~=4.64.1 + - conda-forge::toml~=0.10.2 + # - conda-forge::unidecode~=1.3.4 + # - conda-forge::wget~=1.20.3 + # - conda-forge::xlrd~=2.0.1 + - gurobi::gurobi + - pip: + - git+https://github.com/JoshLoecker/fast_bioservices + # - escher==1.7.3 + - git+https://github.com/JoshLoecker/escher.git@python38#subdirectory=py + - framed==0.5.* + - memote<=1.0 + - git+https://github.com/JoshLoecker/cobamp.git + - git+https://github.com/JoshLoecker/troppo.git diff --git a/main/COMO.ipynb b/main/COMO.ipynb new file mode 100644 index 00000000..df6df0b3 --- /dev/null +++ b/main/COMO.ipynb @@ -0,0 +1,1175 @@ +{ + "cells": [ + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# COMO: Constraint-based Optimization of Metabolic Objectives\n", + "\n", + "COMO is used to build computational models that simulate the biochemical and physiological processes that occur in a cell or organism, known as constraint-based metabolic models. The basic idea behind a constraint-based metabolic model is to use a set of constraints to place boundaries on the system being modeled. These constraints may include (but are not limited to) limits on the availability of nutrients, energy requirements, and the maximum rates of metabolic reactions. COMO imposes these constraints within a specific context. This context includes the cell or tissue type being modeled, along with its disease state.
In addition to creating metabolic models, COMO serves as a platform to identify (1) drug targets and (2) repurposable drugs for metabolism-impacting diseases.\n", + "\n", + "\n", + "This pipeline has everything necessary to build a model from any combination of the following sources:\n", + "- Bulk RNA-seq (total and mRNA)\n", + "- Single-cell RNA-seq\n", + "- Proteomics\n", + "\n", + "\n", + "COMO does not require programming experience to create models. However, every step of the pipeline is easily accessible to promote modification, addition, or replacement of analysis steps. In addition, this Docker container comes pre-loaded with popular R and Python libraries; if you would like to use a library and cannot install it for any reason, please [request it on our GitHub page](https://github.com/HelikarLab/COMO)!\n", + "\n", + "\n", + "

\n", + "⚠️ WARNING ⚠️\n", + "

\n", + "\n", + "If you terminate your session after running Docker, any changes you make *will **not** be saved*. Please mount a local directory to the docker image, [as instructed on the GitHub page](https://helikarlab.github.io/COMO/#choosing-a-tag), to prevent data loss.\n", + "\n", + "# Before Starting\n", + "## Input Files\n", + "The proper input files, dependent on the types of data you are using, must be loaded before model creation. Some example files are included to build metabolic models of naive, Th1, Th2, and Th17 T-cell subtypes, and identify targets for rheumatoid arthritis.\n", + "\n", + "### RNA-seq\n", + "A correctly formatted folder named \"COMO inputs\" in the data directory. Proper inputs can be generated using our Snakemake pipeline, [FastqToGeneCounts](https://github.com/HelikarLab/FastqToGeneCounts), which is specifically designed for use with COMO. RNA sequencing data can be single-cell, or bulk, but the provided Snakemake pipeline does not process single-cell data as of now. If you are processing RNA-seq data with an alternate procedure or importing a pre-made gene count matrix, follow the instructions [listed under Step 1](#Importing-a-Pre-Generated-Counts-Matrix)\n", + "\n", + "### Proteomics\n", + "A matrix of measurement values, where rows are protein names in Entrez format and columns are sample names\n", + "\n", + "## Configuration Information\n", + "You should upload configuration files (in Excel format, `.xlsx`) to `data/config_sheets`. The sheet names in these configuration files should correspond to the context (tissue name, cell name, etc.). The data in each sheet contains the sample names to include in that context-specific model. These sample names should correspond to the column name in the source data matrix, which will be output (or uploaded, if you have your own data) to `data/data_matrices/MODEL-NAME`\n", + "\n", + "# Drug Target Identification\n", + "\n", + "1. Preprocess Bulk RNA-seq data\n", + " 1. Convert STAR-output gene count files into a unified matrix\n", + " 2. Fetch necessary information about each gene in the matrix\n", + " 3. Generate a configuration file\n", + "2. Analyze any combination of RNA-seq or proteomics data, and output a list of active genes for each strategy\n", + "3. Check for a consensus amongst strategies according to a desired rigor and merge into a singular set of active genes\n", + "4. Create a tissue-specific model based on the list of active genes (from Step 3)\n", + "5. Identify differential gene expression from disease datasets using RNA-seq transcriptomics information\n", + "6. Identify drug targets and repurposable drugs. This step consists of four substeps:\n", + " 1. Map drugs to models\n", + " 2. Knock-out simulation\n", + " 3. Compare results between perturbed and unperturbed models (i.e., knocked-out models vs non-knocked-out models)\n", + " 4. Integrate with disease genes and create a score of drug targets" + ], + "id": "227da2b778760c93" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# Step 1: Data Preprocessing and Analysis\n", + "\n", + "The first step of COMO will perform processing and analysis on each of the following data:\n", + "- Total RNA sequencing\n", + "- mRNA sequencing\n", + "- Proteomics\n", + "\n", + "## RNA-seq Data\n", + "RNA sequencing data is read by COMO as a count matrix, where each column is a different sample or replicate named \"tissueName_SXRYrZ\", where:\n", + "- \"`X`\" represents the study (or batch) number. 
Each study represents a new experiment\n", + "- \"`Y`\" represents the replicate number\n", + "- \"`Z`\" represents the run number. If the replicate does not contain multiple runs, then \"`rZ`\" should not be included.\n", + "- \"`tissueName`\" represents the name of the model that will be built from this data. It should be consistent with other data sources if you would like them to be integrated.\n", + "\n", + "❗The `tissueName` identifier should not contain any special characters, including `_`. Doing so may interfere with parsing throughout this pipeline.\n", + "\n", + "Replicates should come from the same study or batch group. Different studies/batches can come from different published studies, as long as the tissue/cell was under similar enough conditions for your personal modeling purposes. \"Run numbers\" in the same replicate will be summed together.\n", + "\n", + "### Example\n", + "Pretend `S1` represents a study done by Margaret and `S2` represents a different study done by John. Margaret's experiment contains three replicates, while John's only contains two. Each of these studies comes from m0 Macrophages. Using this cell name, we will set our tissue name to `m0Macro`. The studies were conducted in different labs, by different researchers, at different points in time, even using different preparation kits. Using this information, we have the following samples:\n", + "\n",

**m0 Macrophage Data**

| Margaret's Data |  |  | John's Data |  |  |
| --- | --- | --- | --- | --- | --- |
| Study | Replicate | Resulting Name | Study | Replicate | Resulting Name |
| S1 | R1 | m0Macro_S1R1 | S2 | R1 | m0Macro_S2R1 |
| S1 | R2 | m0Macro_S1R2 | S2 | R2 | m0Macro_S2R2 |
| S1 | R3 | m0Macro_S1R3 | - | - | - |
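Since every sample name follows the same positional `tissueName_SXRYrZ` pattern, it can be validated or split programmatically. The sketch below is a hypothetical helper (not part of COMO); the regular expression is an assumption based purely on the convention described above, with the run component optional and no underscores in the tissue name.

```python
import re

# Assumed pattern for the tissueName_SXRYrZ convention described above;
# the run component ("rZ") is optional and the tissue name contains no underscores.
SAMPLE_PATTERN = re.compile(
    r"^(?P<tissue>[A-Za-z0-9]+)_S(?P<study>\d+)R(?P<replicate>\d+)(?:r(?P<run>\d+))?$"
)


def parse_sample_name(name: str) -> dict[str, str]:
    """Split a COMO-style sample name into its components."""
    match = SAMPLE_PATTERN.match(name)
    if match is None:
        raise ValueError(f"'{name}' does not follow the tissueName_SXRYrZ convention")
    return {key: value for key, value in match.groupdict().items() if value is not None}


print(parse_sample_name("m0Macro_S1R2"))    # {'tissue': 'm0Macro', 'study': '1', 'replicate': '2'}
print(parse_sample_name("m0Macro_S2R1r3"))  # also includes 'run': '3'
```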
+ "From the `Resulting Name` column, the `m0Macro_S1R1`, `m0Macro_S1R2`, and `m0Macro_S1R3` samples (Margaret's data) will be checked for gene expression consensus to generate a list of active genes in all three replicates. The same will be done for `m0Macro_S2R1` and `m0Macro_S2R2` (John's data). Once these two *separate* lists of active genes have been generated, expression *between* lists will be checked for additional consensus between the studies. This system is used not only to help maintain organization throughout COMO, but also because most types of normalized gene counts cannot undergo direct comparisons across replicates. This is especially true for comparisons between different experiments. Therefore, COMO will convert normalized gene counts into a boolean list of active genes. These lists will be compared at the level of replicates in a study, and then again at the level of all provided studies. Finally, the active genes will be merged with the outputs of proteomics and various RNA-sequencing strategies if provided. The rigor used at each level is easily modifiable.\n", + "\n", + "\n", + "### Initializing RNA-seq Data\n", + "\n", + "Please choose an option below:\n", + "1. Importing a `COMO inputs` directory\n", + "   1. [Initialization using the Snakemake Pipeline](https://github.com/HelikarLab/FastqToGeneCounts)\n", + "   2. [Creating your own Inputs](#Creating-a-Properly-Formatted-COMO-inputs-Folder)\n", + "2. [Importing a pre-generated gene counts file](#Importing-a-Pre-Generated-Counts-Matrix)\n", + "\n", + "#### Snakemake Pipeline\n", + "It is recommended you use the available Snakemake pipeline to align reads and create a properly formatted `COMO inputs` folder. The pipeline also runs a series of quality control steps to help determine if any of the provided samples are not suitable for model creation. This pipeline can be found at https://github.com/HelikarLab/FastqToGeneCounts.\n", + "\n", + "The folder output from the Snakemake pipeline can be uploaded directly to the folder `data/COMO inputs` in this pipeline.\n", + "\n", + "Once this is done, continue to the code block at the end of this section.\n", + "\n", + "#### Creating a Properly Formatted `COMO inputs` Folder\n", + "\n", + "\n", + "If you are using your own alignment protocol, follow this section to create a properly formatted `COMO inputs` folder.\n", + "\n", + "The top-level of the directory will have separate tissue/cell types that models should be created from. The next level must have a folder called `geneCounts`, and optionally a `strandedness` folder. If you are using zFPKM normalization, two additional folders must be included: `layouts` and `fragmentSizes`. Inside each of these folders should be folders named `SX`, where `X` is the study number the replicates are associated with.\n", + "\n",
+ "**Gene Counts**\n", + "Create a folder named `geneCounts`. The outputs of the STAR aligner using the `--quantMode GeneCounts` option should be included inside the \"study-number\" folders (`SX`) of `geneCounts`. To help you (and COMO!) stay organized, these outputs should be renamed `tissueName_SXRYrZ.tab`. Just like above, `X` is the study number, `Y` is the replicate number, and (if present) `Z` is the run number. If the replicate does not contain multiple runs, the `rZ` should be excluded from the name. Replicates should come from the same study/sample group. Different samples can come from different published studies as long as the experiments were performed under similar enough conditions for your modeling purposes.\n", + "\n", + "**Strandedness**\n", + "Create a folder named `strandedness`. This folder should contain files named `tissueName_SXRYrZ_strandedness.txt`. These files must tell the strandedness of the RNA-sequencing method used. Each file should contain one of the following texts (and nothing else):\n", + " - `NONE`: If you don't know the strandedness\n", + " - `FIRST_READ_TRANSCRIPTION_STRAND`: If this RNA-sequencing sample originates from the first strand of cDNA, or the \"antisense\" strand\n", + " - `SECOND_READ_TRANSCRIPTION_STRAND`: If this RNA-sequencing sample originates from the second strand of cDNA, or the \"sense\" strand\n", + "\n", + "**Layouts**\n", + "Create a folder named `layouts`. Files should be named `tissueName_SXRYrZ_layout.txt`, where each file tells the layout of the library used. It must contain one of the following texts, and nothing else:\n", + "- `paired-end`: Paired-end reads were generated\n", + "- `single-end`: Single-end reads were generated\n", + "\n", + "**Fragment Sizes**\n", + "Create a folder named `fragmentSizes`. Files should be named `tissueName_SXRYrZ_fragment_sizes.txt` and contain the output of [RSeQC](https://rseqc.sourceforge.net/)'s `como/RNA_fragment_size.py` function.\n", + "\n", + "**Preparation Methods**\n", + "Create a folder named `prepMethods`. Files should be named `tissueName_SXRYrZ_prep_method.txt`. Each file should tell the library preparation strategy. It must contain one of the following texts, and nothing else:\n", + "- `total`: All RNA expression was measured (mRNA, ncRNA, rRNA, etc.)\n", + "- `mRNA`: Only polyA mRNA expression was measured\n", + "\n", + "It should be noted that these strategies only serve to differentiate the methods in the event that both are used to build a model. If a different library strategy is desired, you have two options:\n", + "1. Replace one of these with a placeholder. If you only have polyA mRNA expression, you only have to enter data for those samples. Do not fill out any samples with `total`.\n", + "2. With a little Python knowledge, a new strategy can easily be added to the `como/merge_xomics.py` file. If you would like to do so, the file is located under `como/merge_xomics.py` in this Jupyter Notebook.\n", + "\n",
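To make the layout above concrete, the short sketch below builds the file paths that one replicate would need. Only the folder names and file suffixes come from the description above; the `data/COMO_input` root, the tissue name, and the study/replicate numbers are illustrative assumptions.

```python
from pathlib import Path

# Illustrative values; replace them with your own tissue name and study/replicate numbers.
root = Path("data/COMO_input")  # assumed location of the COMO inputs folder
tissue, study, replicate = "exampleTissue", 1, 1
sample = f"{tissue}_S{study}R{replicate}"

expected_files = {
    "geneCounts": root / tissue / "geneCounts" / f"S{study}" / f"{sample}.tab",
    "strandedness": root / tissue / "strandedness" / f"S{study}" / f"{sample}_strandedness.txt",
    "layouts": root / tissue / "layouts" / f"S{study}" / f"{sample}_layout.txt",
    "fragmentSizes": root / tissue / "fragmentSizes" / f"S{study}" / f"{sample}_fragment_sizes.txt",
    "prepMethods": root / tissue / "prepMethods" / f"S{study}" / f"{sample}_prep_method.txt",
}
for folder, path in expected_files.items():
    print(f"{folder:>15}: {path}")
```

As noted above, only `geneCounts` is strictly required; `strandedness` is optional, and `layouts` and `fragmentSizes` matter when zFPKM normalization is used.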
+ "#### Importing a Pre-Generated Counts Matrix\n", + "Import a properly formatted counts matrix to `data/data_matrices/exampleTissue/gene_counts_matrix_exampleTissue.csv`. The columns should be named `exampleTissue_SXRY` (note the lack of a run number (`rZ`); runs should be summed into each replicate). If you are providing the count matrix this way, instead of generating one using the Snakemake pipeline mentioned above, you must create a configuration file that has each sample's name, study number, and, if using zFPKM, layout and mean fragment length. Use the provided template below to create yours. Once you have created this file and placed it under the `data/data_matrices/exampleTissue` directory, run the `como/rnaseq_preprocess.py` file with `preprocess-mode` set to `provide-matrix`.\n", + "\n", + "This method is best if you are downloading a premade count matrix, or using single-cell data that has already been batch corrected, clustered, and sorted into only the cell type of interest!\n", + "\n",
**Example Gene Count Table**

| genes | exampleTissue_S1R1 | exampleTissue_S1R2 | exampleTissue_S2R1 | exampleTissue_S2R2 |
| --- | --- | --- | --- | --- |
| ENSG00000000003 | 20 | 29 | 52 | 71 |
| ENSG00000000005 | 0 | 0 | 0 | 0 |
| ENSG00000000419 | 1354 | 2081 | 1760 | 3400 |
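If you are assembling such a matrix yourself, a few lines of pandas are enough to produce the expected shape (genes as rows, one `tissueName_SXRY` column per replicate). This is a minimal sketch with toy values; the output location mirrors the `data/data_matrices/exampleTissue` path mentioned above.

```python
from pathlib import Path

import pandas as pd

# Toy counts; rows are gene identifiers, columns follow the tissueName_SXRY convention.
counts = pd.DataFrame(
    {
        "exampleTissue_S1R1": [20, 0, 1354],
        "exampleTissue_S1R2": [29, 0, 2081],
        "exampleTissue_S2R1": [52, 0, 1760],
        "exampleTissue_S2R2": [71, 0, 3400],
    },
    index=pd.Index(["ENSG00000000003", "ENSG00000000005", "ENSG00000000419"], name="genes"),
)

output_path = Path("data/data_matrices/exampleTissue/gene_counts_matrix_exampleTissue.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
counts.to_csv(output_path)
```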
" + ], + "id": "583a9cc77019b929" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from pathlib import Path\n", + "\n", + "\n", + "def get_notebook_dir() -> Path:\n", + " \"\"\"Get the directory of the COMO.ipynb notebook.\n", + "\n", + " This is used to set the proper input and output paths for the rest of the notebook\n", + "\n", + "\n", + " Returns:\n", + " The directory containing the `COMO.ipynb` notebook\n", + "\n", + " Raises:\n", + " FileNotFoundError: If the root directory could not be determined\n", + " \"\"\"\n", + " current_dir = Path().cwd()\n", + "\n", + " while True:\n", + " files = [i.name for i in current_dir.iterdir()]\n", + " if \"COMO.ipynb\" in files:\n", + " return current_dir\n", + " if current_dir.as_posix() == \"/\":\n", + " raise FileNotFoundError(\"Root directory could not be determined; unable to find 'COMO.ipynb'\")\n", + " current_dir = current_dir.parent" + ], + "id": "18b794c4e630179e" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from pathlib import Path\n", + "\n", + "from como.data_types import RNAType\n", + "from como.rnaseq_preprocess import rnaseq_preprocess\n", + "\n", + "taxon_id = 9606\n", + "# context_names = [\"control1hr\", \"control6hr\", \"control15min\", \"control24hr\", \"indralin1hr\", \"indralin6hr\",\n", + "# \"indralin15min\", \"indralin24hr\", \"notreatment\"]\n", + "context_names = [\"notreatment\"]\n", + "notebook_dir = get_notebook_dir()\n", + "\n", + "# Input Components\n", + "como_context_dir = {context: Path(notebook_dir / f\"data/COMO_input/{context}\") for context in context_names}\n", + "gene_info_filepath = {context: Path(notebook_dir / f\"data/results/{context}/gene_info.csv\") for context in\n", + " context_names}\n", + "trna_matrix_filepath = {context: Path(notebook_dir / f\"data/results/{context}/total-rna/totalrna_{context}.csv\") for\n", + " context in\n", + " context_names}\n", + "mrna_matrix_filepath = {context: Path(notebook_dir / f\"data/results/{context}/mrna/mrna_{context}.csv\") for context in\n", + " context_names}\n", + "proteomics_matrix_filepath = {\n", + " context: Path(notebook_dir / f\"data/data_matrices/{context}/protein_abundance_{context}.csv\") for context\n", + " in context_names}\n", + "\n", + "# No single-cell data is provided by default; COMO accepts single-cell data in CSV or h5ad format\n", + "# If you are using single-cell data, adjust the following lines accordingly\n", + "scrna_matrix_filepath = {context: Path(notebook_dir / f\"data/results/{context}/scrna/scrna_{context}.csv\") for context\n", + " in\n", + " context_names}\n", + "# scrna_matrix_filepath = [Path(f\"data/results/{context}/scrna/scrna_{context}.h5ad\") for context in context_names]\n", + "\n", + "trna_metadata_filepath = Path(notebook_dir / \"data/config_sheets/trna_config.xlsx\")\n", + "mrna_metadata_filepath = Path(notebook_dir / \"data/config_sheets/mrna_config.xlsx\")\n", + "proteomics_metadata_filepath = Path(notebook_dir / \"data/config_sheets/proteomics_config.xlsx\")\n" + ], + "id": "a1bfbaedc90f3090" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### RNA-seq Preprocessing Parameters\n", + "- `context_names`: The tissue/cell types to use. 
This is a simple list of names, such as `[\"naiveB\", \"regulatoryTcell\"]`\n", + "- `gene_format`: The format of input genes; accepts `\"Entrez\"`, `\"Ensembl\"`, or `\"Symbol\"`\n", + "- `taxon_id`: The [NCBI Taxon ID](https://www.ncbi.nlm.nih.gov/taxonomy) to use\n", + "- `preprocess_mode`: This should be set to `\"create-matrix\"` if you are **not** providing a matrix, otherwise set it to `\"provide-matrix\"`" + ], + "id": "abe71b54fcb24765" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "for context in context_names:\n", + "    if context not in {*trna_matrix_filepath, *mrna_matrix_filepath}:\n", + "        continue\n", + "    await rnaseq_preprocess(\n", + "        context_name=context,\n", + "        taxon=taxon_id,\n", + "        como_context_dir=como_context_dir[context],\n", + "        input_matrix_filepath=None,\n", + "        output_gene_info_filepath=gene_info_filepath[context],\n", + "        output_trna_metadata_filepath=trna_metadata_filepath,\n", + "        output_trna_count_matrix_filepath=trna_matrix_filepath[context],\n", + "        # output_mrna_metadata_filepath=mrna_metadata_filepath,\n", + "        # output_mrna_count_matrix_filepath=mrna_matrix_filepath[context],\n", + "        cache=True,\n", + "        log_level=\"INFO\",\n", + "    )" + ], + "id": "fea7f4c6473d2e7a" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "\n", + "## Identification of Gene Activity in Transcriptomic and Proteomic Datasets\n", + "\n", + "This part of Step 1 will identify gene activity in the following data sources:\n", + "- RNA-seq (total, mRNA, or single cell)\n", + "- Proteomics\n", + "\n", + "Only one source is required for model generation, but multiple sources can be helpful for additional validation if they are of high enough quality.\n", + "\n", + "### Filtering Raw Counts\n", + "Regardless of the normalization technique used, or the provided files used for RNA-seq, preprocessing is required to fetch relevant gene information needed for harmonization and normalization, such as Entrez ID and the start and end positions. Currently, COMO can filter raw RNA-sequencing counts using one of the following normalization techniques:\n", + "\n", + "#### Transcripts Per Million Quantile\n", + "Each replicate is normalized with Transcripts-Per-Million, and an upper quantile is taken to create a boolean list of active genes for the replicate (i.e., `R1`). Replicates are compared for consensus within the study, and then studies are compared between one another for additional consensus. The strictness of the consensus can easily be set using the appropriate option within the `rnaseq_gen.py` code-block.\n", + "\n", + "This method is recommended if you want more control over the size of the model; smaller models can include only the most expressed reactions, and larger models can encompass less essential reactions.\n", + "\n", + "#### zFPKM\n", + "This method is outlined by [Hart et al.](https://pubmed.ncbi.nlm.nih.gov/24215113/). Counts will be normalized using zFPKM, and genes with zFPKM > -3 will be considered \"expressed\" per Hart's recommendation. Expressed genes will be checked for consensus at the replicate and study level.\n", + "\n", + "This method is recommended if you want less control over which genes are essential, and instead use the most standardized method of active gene determination. This method is more \"hands-off\" than the above TPM Quantile method.\n", + "\n", + "#### Counts Per Million\n", + "This is a flat cutoff value of counts per million normalized values. Gene expression will be checked for consensus at the replicate and study level.\n", + "\n", + "This method is not recommended, as zFPKM is much more robust for a similar level of \"hands-off\" model building.\n" + ], + "id": "7588e085dca7726c" + },
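As a rough, self-contained illustration of the TPM-quantile idea above (this is not COMO's implementation, which lives in `como/rnaseq_gen.py`), the sketch below converts raw counts to TPM and marks a gene as "active" in a replicate when it sits at or above an upper quantile. The gene lengths and the 0.5 quantile are arbitrary assumptions for the example.

```python
import pandas as pd


def to_tpm(counts: pd.DataFrame, gene_lengths_kb: pd.Series) -> pd.DataFrame:
    """Convert a raw count matrix (genes x samples) to transcripts per million."""
    rpk = counts.div(gene_lengths_kb, axis=0)        # reads per kilobase of transcript
    return rpk.div(rpk.sum(axis=0), axis=1) * 1e6    # scale each sample to one million


counts = pd.DataFrame(
    {"exampleTissue_S1R1": [20, 0, 1354], "exampleTissue_S1R2": [29, 0, 2081]},
    index=["ENSG00000000003", "ENSG00000000005", "ENSG00000000419"],
)
gene_lengths_kb = pd.Series([2.0, 1.5, 3.2], index=counts.index)  # assumed gene lengths

tpm = to_tpm(counts, gene_lengths_kb)
cutoff = tpm.quantile(0.5)             # per-replicate upper-quantile cutoff (assumed 0.5)
active_genes = tpm.ge(cutoff, axis=1)  # boolean "active" calls, one column per replicate
print(active_genes)
```

COMO then checks replicate-level calls like these for consensus within and across studies, as described above.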
+ { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "### RNA Sequencing Analysis\n", + "#### Bulk RNA Sequencing\n", + "\n", + "Bulk RNA sequencing has multiple library preparation strategies (total, polyA-mRNA). If you are using public data, you may encounter a situation where you would like to use a combination of bulk RNA sequencing data produced using two different library preparation strategies.\n", + "\n", + "COMO currently supports the two most common strategies, mRNA polyA enriched RNA sequencing, and total RNA sequencing. Because of the expected differences in distribution of transcripts, COMO is written to handle each strategy separately before the integration step. The recommended Snakemake alignment pipeline is designed to work with COMO's preprocessing step ([Step 1, above](Step-1:-Initialize-and-Preprocess-RNA-seq-data)) to split RNA sequencing data from GEO into separate input matrices and configuration files.\n", + "\n", + "To create a gene expression file for total RNA sequencing data, use `\"total\"` for the \"`--library-prep`\" argument.\n", + "To create a gene expression file for mRNA polyA enriched data, use `\"mRNA\"` for the \"`--library-prep`\" argument.\n", + "\n", + "The analysis of each strategy is identical. Specifying the type of analysis (total vs mRNA) only serves to ensure COMO analyzes them separately.\n", + "\n", + "#### Single Cell RNA Sequencing\n", + "While the Snakemake pipeline does not yet support single-cell alignment, and COMO does not yet support automated configuration file and counts matrix file creation for single-cell alignment output from STAR, it is possible to use single-cell data to create a model with COMO. Because normalization strategies can be applied to single-cell data in the same way they are applied to bulk RNA sequencing, `como/rnaseq_gen.py` can be used with a provided counts matrix and configuration file, from [Step 1](Step-1:-Initialize-and-Preprocess-RNA-seq-data), above. Just like `\"total\"` and `\"mRNA\"`, `como/rnaseq_gen.py` can be executed with `\"SC\"` as the \"`--library-prep`\" argument to help COMO differentiate it from any bulk RNA sequencing data if multiple strategies are being used." + ], + "id": "2c157397f90cdf33" + },
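The replicate-and-study consensus described above boils down to simple proportion thresholds. The toy sketch below shows the idea for one study with made-up boolean calls and a 0.75 replicate ratio; it mirrors the meaning of the `rep_ratio`/`replicate_ratio` options used in the section that follows, but it is not COMO's implementation.

```python
import pandas as pd

# Toy replicate-level activity calls for one study (True = active in that replicate).
replicate_calls = pd.DataFrame(
    {"S1R1": [True, True, False], "S1R2": [True, False, False], "S1R3": [True, True, True]},
    index=["geneA", "geneB", "geneC"],
)

replicate_ratio = 0.75  # same meaning as the rep_ratio parameter described below
active_in_study = replicate_calls.mean(axis=1) >= replicate_ratio
print(active_in_study)  # geneA: True (3/3), geneB: False (2/3 < 0.75), geneC: False (1/3)
```

The same proportion test is then applied a second time across studies, using the group/batch ratio.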
The difference in these options is discussed above\n", + "- `quantile`: The cutoff Transcripts-Per-Million quantile for filtering\n", + "- `min_zfpkm`: The minimum zFPKM cutoff for zFPKM filtering\n", + "- `prep_method`: The library method used for preparation. Options are: `\"total\"`, `\"mRNA\"`, or `\"SC\"`\n" + ], + "id": "ee2de748ea687a06" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from como.rnaseq_gen import FilteringTechnique, rnaseq_gen\n", + "\n", + "replicate_ratio = 0.75\n", + "high_confidence_replicate_ratio = 1.0\n", + "batch_ratio = 0.75\n", + "high_confidence_batch_ratio = 1.0\n", + "technique = FilteringTechnique.ZFPKM\n", + "cutoff = -3\n", + "\n", + "for context in context_names:\n", + " if context not in trna_matrix_filepath:\n", + " continue\n", + " output_zscore_norm_filepath = Path(get_notebook_dir() / f\"data/results/{context}/z_score_normalization.csv\")\n", + " await rnaseq_gen(\n", + " context_name=context,\n", + " input_rnaseq_filepath=trna_matrix_filepath[context],\n", + " input_gene_info_filepath=gene_info_filepath[context],\n", + " output_boolean_activity_filepath=trna_matrix_filepath[context],\n", + " prep=RNAType.TRNA,\n", + " taxon_id=taxon_id,\n", + " input_metadata_filepath_or_df=trna_metadata_filepath,\n", + " replicate_ratio=replicate_ratio,\n", + " high_replicate_ratio=high_confidence_replicate_ratio,\n", + " batch_ratio=batch_ratio,\n", + " high_batch_ratio=high_confidence_batch_ratio,\n", + " technique=technique,\n", + " cutoff=cutoff,\n", + " output_zscore_normalization_filepath=output_zscore_norm_filepath,\n", + " )" + ], + "id": "16dbbffd7418b149" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## mRNA Sequencing Generation\n", + "These parameters are identical to the ones listed for [total RNA sequencing](#Total-RNA-Sequencing-Generation), but they are listed again here for ease of reference\n", + "\n", + "### Parameters\n", + "- `mrnaseq_config_file`: The configuration filename for mRNA sequencing. This file is found under the `data/config_sheets` folder\n", + "- `rep_ratio`: The proportion of replicates in which a gene must be expressed before it is considered \"active\" in a study\n", + "- `group_ratio`: The proportion of studies with expression required for a gene to be considered \"active\"\n", + "- `rep_ratio_h`: The proportion of replicates that must express a gene before that gene is considered \"high-confidence\"\n", + "- `group_ratio_h`: The proportion of studies that must express a gene before that gene is considered \"high-confidence\"\n", + "- `technique`: The technique to use. Options are: `\"quantile\"`, `\"cpm\"`, or `\"zfpkm\"`. The difference in these options is discussed above\n", + "- `quantile`: The cutoff Transcripts-Per-Million quantile for filtering\n", + "- `min_zfpkm`: The minimum zFPKM cutoff for zFPKM filtering\n", + "- `prep_method`: The library method used for preparation. 
Options are: `\"total\"`, `\"mRNA\"`, or `\"SC\"`\n" + ], + "id": "381808e882528556" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from como.rnaseq_gen import FilteringTechnique, rnaseq_gen\n", + "\n", + "replicate_ratio = 0.75\n", + "high_confidence_replicate_ratio = 1.0\n", + "batch_ratio = 0.75\n", + "high_confidence_batch_ratio = 1.0\n", + "technique = FilteringTechnique.ZFPKM\n", + "cutoff = -3\n", + "\n", + "for context in context_names:\n", + " if context not in mrna_matrix_filepath:\n", + " continue\n", + " await rnaseq_gen(\n", + " context_name=context,\n", + " input_rnaseq_filepath=mrna_matrix_filepath[context],\n", + " input_gene_info_filepath=gene_info_filepath[context],\n", + " output_boolean_activity_filepath=mrna_matrix_filepath[context],\n", + " prep=RNAType.MRNA,\n", + " taxon_id=taxon_id,\n", + " input_metadata_filepath=mrna_metadata_filepath,\n", + " replicate_ratio=replicate_ratio,\n", + " high_replicate_ratio=high_confidence_replicate_ratio,\n", + " batch_ratio=batch_ratio,\n", + " high_batch_ratio=high_confidence_batch_ratio,\n", + " technique=technique,\n", + " cutoff=cutoff\n", + " )" + ], + "id": "653e0f1c879a3511" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Single-Cell RNA Sequencing Generation\n", + "These parameters are identical to the ones listed for [total RNA sequencing](#Total-RNA-Sequencing-Generation), but they are listed again here for ease of reference\n", + "\n", + "### Parameters\n", + "- `scrnaseq_config_file`: The configuration filename for single-cell RNA sequencing. This file is found under the `data/config_sheets` folder\n", + "- `rep_ratio`: The proportion of replicates in which a gene must be expressed before it is considered \"active\" in a study\n", + "- `group_ratio`: The proportion of studies with expression required for a gene to be considered \"active\"\n", + "- `rep_ratio_h`: The proportion of replicates that must express a gene before that gene is considered \"high-confidence\"\n", + "- `group_ratio_h`: The proportion of studies that must express a gene before that gene is considered \"high-confidence\"\n", + "- `technique`: The only option offered for single-cell RNA sequencing is `\"umi\"`\n", + "- `quantile`: The cutoff Transcripts-Per-Million quantile for filtering\n", + "- `min_zfpkm`: The minimum zFPKM cutoff for zFPKM filtering\n", + "- `prep_method`: The library method used for preparation. 
Options are: `\"total\"`, `\"mRNA\"`, or `\"scrna\"`\n" + ], + "id": "76e24f90cd85cd12" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from como.rnaseq_gen import FilteringTechnique, rnaseq_gen\n", + "\n", + "replicate_ratio = 0.75\n", + "high_confidence_replicate_ratio = 1.0\n", + "batch_ratio = 0.75\n", + "high_confidence_batch_ratio = 1.0\n", + "technique = FilteringTechnique.UMI\n", + "cutoff = -3\n", + "\n", + "for context in context_names:\n", + " if context not in scrna_matrix_filepath:\n", + " continue\n", + " await rnaseq_gen(\n", + " context_name=context,\n", + " input_rnaseq_filepath=scrna_matrix_filepath[context],\n", + " input_gene_info_filepath=gene_info_filepath[context],\n", + " output_boolean_activity_filepath=scrna_matrix_filepath[context],\n", + " prep=RNAType.SCRNA,\n", + " taxon_id=taxon_id,\n", + " input_metadata_filepath=Path(\"./data/config_sheets/scrna_config.xlsx\"),\n", + " replicate_ratio=replicate_ratio,\n", + " high_replicate_ratio=high_confidence_replicate_ratio,\n", + " batch_ratio=batch_ratio,\n", + " high_batch_ratio=high_confidence_batch_ratio,\n", + " technique=technique,\n", + " cutoff=cutoff\n", + " )" + ], + "id": "338305e190a0195" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Proteomics Analysis\n", + "The parameters here are mostly the same as those for total RNA and mRNA sequencing analysis, and are listed here for easier reference\n", + "\n", + "### Parameters\n", + "- `proteomics_config_file`: The file path to the proteomics configuration file\n", + "- `rep_ratio`: The ratio required before a gene is considered active in the replicate\n", + "- `batch_ratio`: The ratio required before a gene is considered active in the study\n", + "- `high_rep_ratio`: The ratio required before a gene is considered \"high-confidence\" in the replicate\n", + "- `high_batch_ratio`: The ratio required before a gene is considered \"high-confidence\" in the study\n", + "- `quantile`: The quantile cutoff used for filtering" + ], + "id": "bbf5b896eda5357b" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from como.proteomics_gen import proteomics_gen\n", + "\n", + "for context in context_names:\n", + " await proteomics_gen(\n", + " context_name=context,\n", + " config_filepath=proteomics_metadata_filepath,\n", + " matrix_filepath=proteomics_matrix_filepath[context],\n", + " output_boolean_filepath=Path(f\"data/results/{context}/proteomics/{context}_proteomics_boolean_matrix.csv\"),\n", + " output_gaussian_png_filepath=Path(f\"data/results/{context}/proteomics/{context}_proteomics_gaussian.png\"),\n", + " output_gaussian_html_filepath=Path(f\"data/results/{context}/proteomics/{context}_proteomics_gaussian.html\"),\n", + " output_z_score_matrix_filepath=Path(f\"data/results/{context}/proteomics/{context}_zscore_matrix.csv\"),\n", + " input_entrez_map=Path(f\"data/results/{context}/proteomics/{context}_entrez_map.csv\"),\n", + " replicate_ratio=0.5,\n", + " batch_ratio=0.5,\n", + " high_confidence_replicate_ratio=0.7,\n", + " high_confidence_batch_ratio=0.7,\n", + " quantile=25,\n", + " )" + ], + "id": "ecf2b6e6c2e27f12" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# Cluster Sample Data (Optional)\n", + "This step is used to cluster the samples based on their expression values. This can be used to determine which samples are more similar to each other. 
In a perfect world, one cluster would be created for each context type used. This is done using the `como/cluster_rnaseq.py` script.\n", + "\n", + "To see more about clustering, please visit the [Wikipedia article](https://en.wikipedia.org/wiki/Cluster_analysis)\n", + "\n", + "The parameters for this script are as follows:\n", + "- `context_names`: The tissue/cell name of models that should be clustered. This was defined in the first code block, so it is not redefined here\n", + "- `filt_technique`: The filtering technique to use; options are: `\"zfpkm\"`, `\"quantile\"`, or `\"cpm\"`\n", + "- `cluster_algorithm`: The clustering algorithm to use. Options are: `\"mca\"` or `\"umap\"`\n", + "- `label`: Should the samples be labeled in the plot? Options are: `\"True\"` or `\"False\"`\n", + "- `min_dist`: The minimum distance for UMAP clustering. Must be between 0 and 1. Default value is 0.01\n", + "- `replicate_ratio`: The ratio of active genes in replicates for a batch/study to be considered active. The default is 0.9\n", + "- `batch_ratio`: The ratio of active genes in batches/studies for a context to be considered active. The default is 0.9\n", + "- `min_count`: The ratio of active genes in a batch/study for a context to be considered active. The default is `\"default\"`\n", + "- `quantile`: The ratio of active genes in a batch/study for a context to be considered active. The default is 0.5\n", + "- `n_neighbors_rep`: N nearest neighbors for replicate clustering. The default is `\"default\"`, which is the total number of replicates\n", + "- `n_neighbors_batch`: N nearest neighbors for batch clustering. The default is `\"default\"`, which is the total number of batches\n", + "- `n_neighbors_context`: N nearest neighbors for context clustering. The default is `\"default\"`, which is the total number of contexts\n", + "- `seed`: The random seed for clustering algorithm initialization. If not specified, `np.random.randint(0, 100000)` is used" + ], + "id": "3a6b4ee9708c405b" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "filt_technique = \"zfpkm\"\n", + "cluster_algorithm = \"umap\"\n", + "label = True\n", + "min_dist = 0.01\n", + "replicate_ratio = 0.9\n", + "batch_ratio = 0.9\n", + "min_count = \"default\"\n", + "quantile = 50\n", + "n_neighbors_rep = \"default\"\n", + "n_neighbors_batch = \"default\"\n", + "n_neighbors_context = \"default\"\n", + "seed = -1\n", + "\n", + "# fmt: off\n", + "cmd = \" \".join(\n", + " [\n", + " \"python3\", \"como/cluster_rnaseq.py\",\n", + " \"--context-names\", context_names,\n", + " \"--filt-technique\", filt_technique,\n", + " \"--cluster-algorithm\", cluster_algorithm,\n", + " \"--label\", label,\n", + " \"--min-dist\", str(min_dist),\n", + " \"--replicate-ratio\", str(replicate_ratio),\n", + " \"--batch-ratio\", str(batch_ratio),\n", + " \"--n-neighbors-rep\", str(n_neighbors_rep),\n", + " \"--n-neighbors-batch\", str(n_neighbors_batch),\n", + " \"--n-neighbors-context\", str(n_neighbors_context),\n", + " \"--min-count\", str(min_count),\n", + " \"--quantile\", str(quantile),\n", + " \"--seed\", str(seed),\n", + " ]\n", + ")\n", + "# fmt: on\n", + "\n", + "!{cmd}" + ], + "id": "ec009ffb9e26ecc6" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "## Merge Expression from Different Data Sources\n", + "\n", + "Thus far, active genes have been determined for at least one data source. 
If multiple data sources are being used, we can merge the active genes from these sources to make a list of active genes that is more comprehensive (or strict!) than any data source on its own.\n", + "\n", + "`como/merge_xomics.py` takes each data source discussed so far as an argument. The other arguments to consider are:\n", + "- `--expression-requirement`: The number of data sources with expression required for a gene to be considered active, if the gene is not \"high-confidence\" for any source. (default: total number of input sources provided)\n", + "- `--requirement-adjust`: This is used to adjust the expression requirement argument in the event that tissues have a different number of provided data sources. This does nothing if there is only one tissue type in the configuration files.\n", + " - `\"progressive\"`: The expression requirement applies to the tissue(s) with the lowest number of data sources. Tissues with more data sources than this will require their genes to be expressed in 1 additional source before those genes are considered \"active\" in the model\n", + " - `\"regressive\"` (default): The expression requirement applies to the tissue(s) with the largest number of data sources. Tissues with fewer data sources than this will require their genes to be expressed in 1 fewer source before a gene is considered \"active\" in the model.\n", + " - `\"flat\"`: The expression requirement is used regardless of differences in the number of data sources provided for different tissues\n", + "\n", + "- `--no-hc`: This flag should be set to prevent high-confidence genes from overriding the expression requirement.\n", + " - If this flag is not used, any gene that was determined to be \"high-confidence\" in any input source will be active in the final model, regardless of agreement with other sources\n", + "- `--no-na-adjustment`: This flag should be used to prevent genes that are not present in one data source, but are present in others, from subtracting one from the expression requirement.\n", + " - If this flag is not used, any time a gene is \"NA\" in a source, meaning it was not tested for in the library of that data source but was tested in the library of another source, it will subtract one from the expression requirement.\n", + "\n", + "The adjusted expression requirement will never resolve to be less than one or greater than the number of data sources for a given tissue\n", + "\n", + "### Parameters\n", + "The three parameters listed here were used in RNA Sequencing generation, and should not need to be defined. 
If you did **not** use one of these, simply comment it out in the code below by placing a \"#\" at the beginning of the appropriate lines\n", + "- `trnaseq_config_file`: The file name used in the [total RNA Sequencing](#Total-RNA-Sequencing-Generation) section of the notebook\n", + "- `mrnaseq_config_file`: The file name used in the [mRNA Sequencing](#mRNA-Sequencing-Generation) section of the notebook\n", + "- `proteomics_config_file`: The file name used in the [proteomics generation](#Proteomics-Analysis) section of the notebook\n", + "\n", + "The following parameters have not been used in a previous section of the notebook, so they are defined in the code block below\n", + "- `expression_requirement`: This is the number of sources a gene must be active in for it to be considered active\n", + "- `requirement_adjust`: The technique to adjust the expression requirement based on differences in the number of provided data source types\n", + "- `total_rna_weight`: Total RNA-seq weight for merging zFPKM distribution\n", + "- `mrna_weight`: mRNA weight for merging zFPKM distribution\n", + "- `single_cell_weight`: Single-cell weight for merging zFPKM distribution\n", + "- `proteomics_weight`: Proteomic weight for merging zFPKM distribution\n", + "\n", + "Each of the \"weights\" (`total_rna_weight`, `mrna_weight`, etc.) is used to assign a significance to its data source. Because there are many steps in the central dogma from transcription to translation, the gene expression seen by total RNA or mRNA sequencing may not be representative of the gene's protein expression, and thus its metabolic impact. Because of this, you are able to weight each source more (or less) than another." + ], + "id": "52eca3228ef747cd" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from como.merge_xomics import AdjustmentMethod, merge_xomics\n", + "\n", + "total_rna_weight = 6\n", + "mrna_weight = 6\n", + "single_cell_weight = 6\n", + "proteomics_weight = 10\n", + "minimum_source_expression = 1\n", + "expression_requirement = 1\n", + "requirement_adjustment_method = AdjustmentMethod.REGRESSIVE\n", + "force_activate_high_confidence = False\n", + "adjust_for_na_sources = False\n", + "merge_zfpkm_distribution = True\n", + "keep_transcriptomics_score = True\n", + "\n", + "mrna_batches = {\"naiveB\": [\n", + " \"naiveB_S1R1\", \"naiveB_S1R2\", \"naiveB_S1R3\", \"naiveB_S1R4\",\n", + " \"naiveB_S2R1\", \"naiveB_S2R2\", \"naiveB_S2R3\", \"naiveB_S2R4\",\n", + "]}\n", + "trna_batches = {\"naiveB\": [\"naiveB_S3R1\", \"naiveB_S3R2\", \"naiveB_S3R3\"]}\n", + "\n", + "for context in context_names:\n", + " await merge_xomics(\n", + " context_name=context,\n", + " trna_matrix_or_filepath=trna_matrix_filepath[context],\n", + " mrna_matrix_or_filepath=mrna_matrix_filepath[context],\n", + " scrna_matrix_or_filepath=None, # scrna_matrix_filepath[context],\n", + " proteomic_matrix_or_filepath=None, # proteomics_matrix_filepath[context],\n", + " trna_batches=trna_batches,\n", + " mrna_batches=mrna_batches,\n", + " scrna_batches=None,\n", + " proteomic_batches=None,\n", + " trna_weight=total_rna_weight,\n", + " mrna_weight=mrna_weight,\n", + " scrna_weight=single_cell_weight,\n", + " proteomic_weight=proteomics_weight,\n", + " minimum_source_expression=minimum_source_expression,\n", + " expression_requirement=expression_requirement,\n", + " adjust_method=requirement_adjustment_method,\n", + " force_activate_high_confidence=force_activate_high_confidence,\n", + " 
adjust_for_na=adjust_for_na_sources,\n", + " merge_zfpkm_distribution=merge_zfpkm_distribution,\n", + " keep_transcriptomics_score=keep_transcriptomics_score,\n", + " output_merge_activity_filepath=Path(f\"data/results/{context}/ActiveGenes_{context}_Merged.csv\"),\n", + " output_transcriptomic_details_filepath=Path(f\"data/results/{context}/TranscriptomicDetails_{context}.csv\"),\n", + " output_trna_activity_filepath=Path(f\"data/results/{context}/total-rna/trna_activity_{context}.csv\"),\n", + " output_mrna_activity_filepath=Path(f\"data/results/{context}/mrna/mrna_activity_{context}.csv\"),\n", + " output_scrna_activity_filepath=Path(f\"data/results/{context}/scrna/scrna_activity_{context}.csv\"),\n", + " output_proteomic_activity_filepath=Path(f\"data/results/{context}/proteomics/proteomic_activity_{context}.csv\"),\n", + " output_final_model_scores_filepath=Path(f\"data/results/{context}/model_scores_{context}.csv\"),\n", + " output_figure_dirpath=Path(f\"data/results/{context}/figures\")\n", + " )" + ], + "id": "d1a0a578865ea4a7" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# Step 2: Create Tissue/Cell-Type Specific Models\n", + "\n", + "## Boundary Reactions\n", + "To create a metabolic model, the following information about each metabolite or reaction involved is required:\n", + "- **Reaction Type**\n", + " - Exchange\n", + " - Demand\n", + " - Sink\n", + "- **Metabolite/Reaction Abbreviation**\n", + " - You can use the [Virtual Metabolic Human](https://www.vmh.life/#home) to look up your metabolite and reaction abbreviations\n", + "- **Compartments**\n", + " - Cytosol\n", + " - Extracellular\n", + " - Golgi Apparatus\n", + " - Internal Membranes\n", + " - Lysosome\n", + " - Mitochondria\n", + " - Nucleus\n", + " - Endoplasmic Reticulum\n", + " - Unknown\n", + "- **Minimum Reaction Rate**\n", + "- **Maximum Reaction Rate**\n", + "\n", + "\n", + "*Below is an example of a properly formatted table of metabolite and reaction information*\n", + "\n", + "| Reaction | Abbreviation | Compartment | Minimum Reaction Rate | Maximum Reaction Rate |\n", + "|:--------:|:------------:|:------------------:|:---------------------:|:---------------------:|\n", + "| Exchange | glc_D | Extracellular | -100 | 1000 |\n", + "| Demand | 15HPETATP | Cytosol | -1 | 1000 |\n", + "| Sink | met_L | Internal Membranes | -1000 | 1 |\n", + "\n", + "\n", + "These reactions should be placed into a CSV file; a template can be found at `data/boundary_rxns/default_force_rxns.csv`. Append your reactions to this file, and remove any that are not required. COMO will load this file during model creation\n", + "\n", + "## Force Reactions\n", + "Force reactions are reactions that should **always** be included in the model, no matter their flux value in the metabolic data provided. In contrast to the boundary reaction list, this is simply a list of reaction names that should be \"forced\" through the model. Append your force reactions to the `data/force_rxns/default_force_rxns.csv` file, and remove any that are not required. COMO will load this file during model creation\n", + "\n", + "*Below is an example of a properly formatted table of force reactions*\n", + "\n", + "| Reaction |\n", + "|:--------:|\n", + "| glc_D |\n", + "| met_L |\n", + "\n", + "## Adding Reference Models\n", + "This Jupyter notebook uses Recon3D, from the [Virtual Metabolic Human](https://www.vmh.life/), as a base model to map reactions onto; it is included with the Jupyter notebook. 
If you would like to include other reference models, simply upload them to the `data` folder, and set the name of the `general_model_file` below to the name of your reference model.\n", + "\n", + "## Parameters\n", + "The following is a list of parameters and their function in this section of the pipeline\n", + "- `low_thres`: If you are using the `IMAT` reconstruction algorithm, gene expression above this value (and below `high_thres`) will be placed in the \"mid-expression\" bin\n", + "- `high_thres`: If you are using the `IMAT` reconstruction algorithm, gene expression above this value will be placed in the \"high-expression\" bin\n", + "- `output_filetypes`: These are the file types you would like to save your model as. It should be one (or multiple) of the following: `\"xml\"`, `\"mat\"`, `\"json\"`\n", + "- `objective_dict`: This is the objective the model should be solved for. Popular options are `\"biomass_reaction\"` or `\"biomass_maintenance\"`\n", + "- `general_model_file`: This is the reference model file to load\n", + "- `recon_algorithm`: The troppo reconstruction algorithm to use. This should be one of the following: `\"FastCORE\"`, `\"CORDA\"`, `\"GIMME\"`, `\"tINIT\"`, `\"IMAT\"`\n", + "- `solver`: The solver to use for optimizing the model. Options are: `\"GUROBI\"` or `\"GLPK\"`\n", + "- `boundary_reactions_filename`: The filename of the boundary reactions that should be used\n", + "- `force_reactions_filename`: The filename of the force reactions to be used. Force reactions will (as the name implies) force the optimizer to use these reactions, **no matter their expression**\n", + "- `exclude_reactions_filename`: The filename of reactions to exclude from the model, no matter their expression" + ], + "id": "aff121ae1e86071b" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "import json\n", + "from pathlib import Path\n", + "\n", + "from como.project import Config\n", + "\n", + "# Set your objectives before running!\n", + "objective_dict = {\"naiveB\": \"biomass_maintenance\", \"smB\": \"biomass_maintenance\"}\n", + "# -----------------\n", + "\n", + "low_threshold = -5\n", + "high_threshold = -3\n", + "output_filetypes = \"xml mat json\"\n", + "general_model_file = \"GeneralModelUpdatedV2.mat\"\n", + "recon_algorithms = [\"IMAT\"]\n", + "solver = \"GUROBI\"\n", + "\n", + "config = Config()\n", + "\n", + "# Load the output of step 1, which is a dictionary that specifies the merged list of active Gene IDs for each tissue\n", + "step1_results_file = config.data_dir / \"results\" / \"step1_results_files.json\"\n", + "with step1_results_file.open(\"r\") as json_file:\n", + " context_gene_exp = json.load(json_file)\n", + "\n", + "for recon_algorithm in recon_algorithms:\n", + " for context in context_gene_exp:\n", + " objective = objective_dict[context]\n", + "\n", + " if recon_algorithm.upper() in [\"IMAT\", \"TINIT\"]:\n", + " active_genes_filepath = config.data_dir / \"results\" / context / f\"model_scores_{context}.csv\"\n", + " else:\n", + " gene_expression_file = context_gene_exp[context]\n", + " active_genes_filename = Path(gene_expression_file).name\n", + " active_genes_filepath = config.data_dir / \"results\" / context / active_genes_filename\n", + "\n", + " general_model_filepath = config.data_dir / \"GeneralModelUpdatedV2.mat\"\n", + " boundary_reactions_filepath = config.data_dir / \"boundary_rxns\" / f\"{context}_boundary_rxns.csv\"\n", + " force_reactions_filepath = config.data_dir / \"force_rxns\" / f\"{context}_force_rxns.csv\"\n", + 
" exclude_reactions_filepath = config.data_dir / \"exclude_rxns\" / f\"{context}_exclude_rxns.csv\"\n", + "\n", + " # fmt: off\n", + " cmd = \" \".join(\n", + " [\n", + " \"python3\", \"como/create_context_specific_model.py\",\n", + " \"--context\", context,\n", + " \"--reference-model-filepath\", general_model_filepath,\n", + " \"--active-genes-filepath\", active_genes_filepath,\n", + " \"--objective\", objective,\n", + " \"--boundary-reactions-filepath\", boundary_reactions_filepath,\n", + " # \"--exclude-reactions-filepath\", exclude_reactions_filepath,\n", + " \"--force-reactions-filepath\", force_reactions_filepath,\n", + " \"--algorithm\", recon_algorithm,\n", + " \"--low-threshold\", str(low_threshold),\n", + " \"--high-threshold\", str(high_threshold),\n", + " \"--solver\", solver,\n", + " \"--output-filetypes\", output_filetypes,\n", + " ]\n", + " )\n", + " # fmt: on\n", + " !{cmd}" + ], + "id": "919e11a52790c5b" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# Generate MEMOTE Reports\n", + "> NOTE: This step is entirely optional\n", + "\n", + "MEMOTE is an open-source tool to automate the testing and reporting of metabolic models. This report is a detailed summary of the tests performed by MEMOTE on a given metabolic model (i.e., the one you just generated), along with the results and recommendations for improving the model. In order to create these reports, a metabolic \"map\" is required. Several of these are included in COMO, found under `data/maps/RECON1`. If you would like to add your own maps, they can be included in multiple places:\n", + "1. If you have mapped a `local_files` directory to the container before starting, you can simply copy-and-paste them into the `local_files/maps` directory using the file browser of your computer. This is the most robust solution because the files will not be deleted by the container after it stops, or if it is updated in the future\n", + "2. You can upload them to the Jupyter notebook under the `data/maps` directory. The code block below will search for any `.json` files that are not already included in the `map_dict` dictionary\n", + "\n", + "config.data_dir,\n", + " \"results\",\n", + " context,\n", + " \"figures\",\n", + " f\"{key}_map_{context}_{algorithm}.html\"\n", + "\n", + "The resulting MEMOTE reports will be saved to `data/results/exampleTissue/figures/mapName_map_exampleTissue_ALGORITHM.html`.\n", + "\n", + "- `mapName`: This is the name of the map file. 
In the `map_dict` dictionary below, this value would be `trypto`, `retinol`, etc.\n", + "- `exampleTissue`: This is the name of the tissue context\n", + "- `ALGORITHM`: This is the algorithm (`recon_algorithm`) used in the above model creation step\n" + ], + "id": "a039b13e7cfb5c45" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "import cobra\n", + "from como.project import Config\n", + "from escher import Builder\n", + "from loguru import logger\n", + "\n", + "config = Config()\n", + "\n", + "user_map_dir = config.data_dir / \"local_files\" / \"maps\"\n", + "map_dict = {\n", + " \"trypto\": f\"{config.data_dir}/maps/RECON1/RECON1.tryptophan_metabolism.json\",\n", + " # \"lipid\": f\"{config.data_dir}/maps/RECON1/RECON1.\", # Not present in COMO by default yet\n", + " \"retinol\": f\"{config.data_dir}/maps/RECON1/RECON1.inositol_retinol_metabolism.json\",\n", + " \"glyco\": f\"{config.data_dir}/maps/RECON1/RECON1.glycolysis_TCA_PPP.json\",\n", + " \"combined\": f\"{config.data_dir}/maps/RECON1/RECON1.combined.json\",\n", + " \"carbo\": f\"{config.data_dir}/maps/RECON1/RECON1.carbohydrate_metabolism.json\",\n", + " \"amino\": f\"{config.data_dir}/maps/RECON1/RECON1.amino_acid_partial_metabolism.json\",\n", + "}\n", + "\n", + "# Collect files from user-input json maps\n", + "index = 1\n", + "for file in user_map_dir.glob(\"**/*.json\"):\n", + " map_dict[file.stem] = file\n", + " index += 1\n", + "\n", + "# Collect any additional maps under the `{config.data_dir}/maps/` directory\n", + "for file in (config.data_dir / \"maps\").glob(\"**/*.json\"):\n", + " if file not in map_dict.values():\n", + " map_dict[file.stem] = file\n", + "\n", + "for recon_algorithm in recon_algorithms:\n", + " for context in context_gene_exp:\n", + " model_json = config.data_dir / \"results\" / context / f\"{context}_SpecificModel_{recon_algorithm}.json\"\n", + "\n", + " logger.info(f\"Loading '{context}', this may take some time...\")\n", + " model = cobra.io.load_json_model(model_json)\n", + " for key in map_dict:\n", + " logger.info(f\"Running with: {key}\")\n", + " builder = Builder(map_json=str(map_dict[key]))\n", + " builder.model = model\n", + " solution = cobra.flux_analysis.pfba(model)\n", + " builder.reaction_data = solution.fluxes\n", + " builder.reaction_scale = [\n", + " {\"type\": \"min\", \"color\": \"#ff3300\", \"size\": 12},\n", + " {\"type\": \"q1\", \"color\": \"#ffc61a\", \"size\": 14},\n", + " {\"type\": \"median\", \"color\": \"#ffe700\", \"size\": 16},\n", + " {\"type\": \"q3\", \"color\": \"#4ffd3c\", \"size\": 18},\n", + " {\"type\": \"max\", \"color\": \"#3399ff\", \"size\": 20},\n", + " ]\n", + " builder.reaction_no_data_color = \"#8e8e8e\"\n", + "\n", + " builder.save_html(\n", + " config.data_dir / \"results\" / context / \"figures\" / f\"{key}_map_{context}_{recon_algorithm}.html\"\n", + " )\n", + "\n", + " out_dir = config.data_dir / \"results\" / context\n", + " report_file = out_dir / f\"memote_report_{context}_{recon_algorithm}.html\"\n", + " model_file = out_dir / f\"{context}_SpecificModel_{recon_algorithm}.xml\"\n", + " log_dir = out_dir / \"memote\"\n", + " log_file = log_dir / f\"{context}_{recon_algorithm}_memote.log\"\n", + "\n", + " if not log_dir.exists():\n", + " log_dir.mkdir(parents=True, exist_ok=True)\n", + "\n", + " cmd = \" \".join(\n", + " [\"memote\", \"report\", \"snapshot\", \"--filename\", f\"{report_file}\", f\"{model_file}\", \">\", f\"{log_file}\"]\n", + " )\n", + "\n", + " !{cmd}" + ], + "id": 
"8993ff06e53c44ac" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# Step 3: Disease-related Gene Identification\n", + "This step can identify disease related genes by analyzing patient transcriptomics' data\n", + "\n", + "In the `data/config_sheets` folder, create another folder called `disease`. Add an Excel file for each tissue/cell type called `disease_data_inputs_`, where `` is the name of the tissue you are interested in. Each sheet of this file should correspond to a separate disease to analyze using differential gene analysis. The file is formatted in the same fashion as described in the [final part of Step 1](#Importing-a-Pre-Generated-Counts-Matrix). The sheet names should be in the following format: `_bulk`\n", + "- ``: This is the name of the disease you are analyzing.\n", + "\n", + "For example, if the disease we are interested in is lupus, and the source of the data is bulk RNA sequencing, the name of the first sheet would be `lupus_bulk`. If you are using bulk RNA sequencing, there should be a gene counts matrix file located at `data/data_matrices//` called `BulkRNAseqDataMatrix_`\n", + "\n", + "## Parameters\n", + "- `disease_names`: The diseases you are using. This should match the first section of the sheet name in the Excel file\n", + "- `data_source`: The datasource you are using for disease analysis. This should be`\"rnaseq\"`\n", + "- `taxon_id`: The [NCBI Taxon ID](https://www.ncbi.nlm.nih.gov/taxonomy) to use for disease analysis" + ], + "id": "b86824241ca2c785" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "from como.utils import stringlist_to_list\n", + "\n", + "disease_names = [\"arthritis\", \"lupus_a\", \"lupus_b\"]\n", + "data_source = \"rnaseq\"\n", + "taxon_id = \"human\"\n", + "\n", + "for context_name in stringlist_to_list(context_names):\n", + " disease_config_file = f\"disease_data_inputs_{context_name}.xlsx\"\n", + "\n", + " # fmt: off\n", + " cmd = \" \".join(\n", + " [\n", + " \"python3\", \"como/disease_analysis.py\",\n", + " \"--context-name\", context_name,\n", + " \"--config-file\", disease_config_file,\n", + " \"--data-source\", data_source,\n", + " \"--taxon-id\", str(taxon_id),\n", + " ]\n", + " )\n", + " # fmt: on\n", + "\n", + " !{cmd}" + ], + "id": "a0e6b3bc29a2ee4c" + }, + { + "metadata": {}, + "cell_type": "markdown", + "source": [ + "# Step 4: Drug Targets & Repurposable Drug Identification\n", + "This step performs a series of tasks:\n", + "1. Map drug targets in metabolic models\n", + "2. Performs knock out simulations\n", + "3. Compares simulation results with \"disease genes\"\n", + "4. Identifies drug targets and repurposable drugs\n", + "\n", + "## Execution Steps\n", + "### Drug Database\n", + "A processed drug-target file is included in the `data` folder, called `Repurposing_Hub_export.txt`. If you would like to include an additional drug-target file, please model your own file after the included one. Alternatively, if you would like to update to a newer version of the database, simply export from the [Drug Repurposing Hub](https://clue.io/repurposing-app). If you do this, remove all `activators`, `agonists`, and `withdrawn` drugs. Replace the `data/Repurposing_Hub_export.txt` file.\n", + "\n", + "### Using Automatically Created Models\n", + "This step will use the models generated in Step 4, above. It is **highly** recommended to use refined and validated models for further analysis (i.e., before running this step of the pipeline). 
If you would like to use a custom model, instead of the one created by COMO, edit the `model_files` dictionary. An example is shown here:\n", + "```python\n", + "model_files = {\n", + " \"exampleTissueModel\": \"/home/jovyan/main/data/myModels/exampleTissueModel.mat\",\n", + " \"anotherTissueModel\": \"/home/jovyan/main/data/myModels/anotherTissueModel.json\",\n", + " \"thirdTissueModel\": \"/home/jovyan/main/data/myModels/thirdTissueModel.xml\"\n", + "}\n", + "```\n", + "\n", + "❗The path `/home/jovyan/main/` **MUST** stay the same. If it does not, your model **will not be found**\n", + "\n", + "\n", + "## Parameters\n", + "Other than the `model_files` parameter (if required), the only other parameter for this section is the `solver` option\n", + "\n", + "- `solver`: The solver you would like to use. Available options are `\"gurobi\"` or `\"glpk\"`\n" + ], + "id": "9898d75ab36fe5d7" + }, + { + "metadata": {}, + "cell_type": "code", + "outputs": [], + "execution_count": null, + "source": [ + "import json\n", + "\n", + "from como.project import Config\n", + "from como.utils import stringlist_to_list\n", + "from loguru import logger\n", + "\n", + "# Knock out simulation for the analyzed tissues and diseases\n", + "model_files = {\n", + " # \"context_name\": \"/path/to/model.mat\"\n", + " # EXAMPLE -> \"Treg\": \"/home/jovyan/main/data/results/naiveB/naiveB_SpecificModel_IMAT.mat\"\n", + "}\n", + "sovler = \"gurobi\"\n", + "\n", + "config = Config()\n", + "\n", + "drug_raw_file = \"Repurposing_Hub_export.txt\"\n", + "for context in stringlist_to_list(context_names):\n", + " for recon_algorithm in recon_algorithms:\n", + " for disease in disease_names:\n", + " disease_path = config.data_dir / \"results\" / context / disease\n", + " out_dir = config.data_dir / \"results\" / context / disease\n", + " tissue_gene_folder = config.data_dir / context\n", + " tissue_gene_folder.mkdir(parents=True, exist_ok=True)\n", + "\n", + " if not disease_path.exists():\n", + " logger.warning(f\"Disease path doesn't exist! 
Looking for {disease_path}\")\n", + " continue\n", + "\n", + " # load the results of step 3 to dictionary \"disease_files\"\n", + " step3_results_file = config.data_dir / \"results\" / context / disease / \"step2_results_files.json\"\n", + "\n", + " with step3_results_file.open(\"r\") as json_file:\n", + " disease_files = json.load(json_file)\n", + " down_regulated_disease_genes = disease_files[\"down_regulated\"]\n", + " up_regulated_disease_genes = disease_files[\"up_regulated\"]\n", + "\n", + " if context in model_files:\n", + " tissue_specific_model_filepath = model_files[context]\n", + " else:\n", + " tissue_specific_model_filepath = (\n", + " config.data_dir / \"results\" / context / f\"{context}_SpecificModel_{recon_algorithm}.mat\"\n", + " )\n", + "\n", + " # fmt: off\n", + " cmd = [\n", + " \"python3\", \"como/knock_out_simulation.py\",\n", + " \"--context-model\", tissue_specific_model_filepath,\n", + " \"--context-name\", context,\n", + " \"--disease-name\", disease,\n", + " \"--disease-up\", up_regulated_disease_genes,\n", + " \"--disease-down\", down_regulated_disease_genes,\n", + " \"--raw-drug-file\", drug_raw_file,\n", + " \"--solver\", sovler,\n", + " # \"--test-all\"\n", + " ]\n", + " # fmt: on\n", + "\n", + " if recon_algorithm == \"IMAT\":\n", + " cmd.extend([\"--reference-flux-file\", config.data_dir / \"results\" / context / \"IMAT_flux.csv\"])\n", + "\n", + " cmd = \" \".join(cmd)\n", + " !{cmd}" + ], + "id": "7b79be19d1f47c81" + } + ], + "metadata": {}, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/main/como/__init__.py b/main/como/__init__.py new file mode 100644 index 00000000..cba3d35f --- /dev/null +++ b/main/como/__init__.py @@ -0,0 +1,14 @@ +from como import plot +from como.data_types import AdjustmentMethod, Algorithm, CobraCompartments, FilteringTechnique, LogLevel, Solver +from como.utils import stringlist_to_list + +__all__ = [ + "AdjustmentMethod", + "Algorithm", + "CobraCompartments", + "FilteringTechnique", + "LogLevel", + "Solver", + "stringlist_to_list", +] +__version__ = "1.11.1" diff --git a/main/como/cluster_rnaseq.py b/main/como/cluster_rnaseq.py new file mode 100644 index 00000000..1078b926 --- /dev/null +++ b/main/como/cluster_rnaseq.py @@ -0,0 +1,226 @@ +from __future__ import annotations + +import argparse +from curses.ascii import isdigit +from dataclasses import dataclass +from typing import Any + +import numpy as np + +from como.data_types import LogLevel +from como.utils import _log_and_raise_error, stringlist_to_list + + +@dataclass +class _Arguments: + context_names: list[str] + filtering_technique: str + cluster_algorithm: str + label_plot: str + min_distance: float + replicate_ratio: Any + batch_ratio: Any + num_replicate_neighbors: Any + num_batch_neighbors: Any + num_context_neighbors: Any + min_active_count: int | str + quantile: Any + seed: int + + def __post_init__(self): # noqa: C901, ignore too complex + self.filtering_technique = self.filtering_technique.lower() + self.cluster_algorithm = self.cluster_algorithm.lower() + + if self.seed == -1: + self.seed = np.random.randint(0, 100_000) + + if (isdigit(self.min_active_count) and int(self.min_active_count) < 0) or self.min_active_count != "default": + _log_and_raise_error( + "min_active_count must be either 'default' or an integer > 0", + error=ValueError, + level=LogLevel.ERROR, + ) + + if (isdigit(self.quantile) and 0 > int(self.quantile) > 100) or self.quantile != "default": + _log_and_raise_error( + "quantile must be either 'default' or an integer between 0 and 100", 
+ error=ValueError, + level=LogLevel.ERROR, + ) + + if (isdigit(self.replicate_ratio) and 0 > self.replicate_ratio > 1.0) or self.replicate_ratio != "default": + _log_and_raise_error( + "--rep-ratio must be either 'default' or a float between 0 and 1", + error=ValueError, + level=LogLevel.ERROR, + ) + + if (isdigit(self.batch_ratio) and 0 > self.batch_ratio > 1.0) or self.batch_ratio != "default": + _log_and_raise_error( + "--batch-ratio must be either 'default' or a float between 0 and 1", + error=ValueError, + level=LogLevel.ERROR, + ) + + if self.filtering_technique.lower() not in {"quantile", "tpm", "cpm", "zfpkm"}: + _log_and_raise_error( + "--technique must be either 'quantile', 'tpm', 'cpm', 'zfpkm'", + error=ValueError, + level=LogLevel.ERROR, + ) + + if self.filtering_technique.lower() == "tpm": + self.filtering_technique = "quantile" + + if self.cluster_algorithm.lower() not in {"mca", "umap"}: + _log_and_raise_error( + "--clust_algo must be either 'mca', 'umap'", + error=ValueError, + level=LogLevel.ERROR, + ) + + if 0 > self.min_distance > 1.0: + _log_and_raise_error( + "--min_dist must be a float between 0 and 1", + error=ValueError, + level=LogLevel.ERROR, + ) + + if (isdigit(self.num_replicate_neighbors) and self.num_replicate_neighbors < 1) or self.num_replicate_neighbors != "default": + _log_and_raise_error( + "--n-neighbors-rep must be either 'default' or an integer > 1", + error=ValueError, + level=LogLevel.ERROR, + ) + + if (isdigit(self.num_batch_neighbors) and self.num_batch_neighbors < 1) or self.num_batch_neighbors != "default": + _log_and_raise_error( + "--n-neighbors-batch must be either 'default' or an integer > 1", + error=ValueError, + level=LogLevel.ERROR, + ) + + if (isdigit(self.num_context_neighbors) and self.num_context_neighbors < 1) or self.num_context_neighbors != "default": + _log_and_raise_error( + "--n-neighbors-context must be either 'default' or an integer > 1", + error=ValueError, + level=LogLevel.ERROR, + ) + + +def _parse_args() -> _Arguments: + parser = argparse.ArgumentParser( + prog="cluster_rnaseq.py", + description="Cluster RNA-seq Data using Multiple Correspondence Analysis or UMAP. Clusters at the replicate, " + "batch/study, and context levels.", + epilog="For additional help, please post questions/issues in the MADRID GitHub repo at " + "https://github.com/HelikarLab/MADRID or email babessell@gmail.com", + ) + parser.add_argument( + "--context-names", + type=str, + required=True, + dest="context_names", + help="Tissue/cell name of models to generate", + ) + parser.add_argument( + "--filtering-technique", + type=str, + required=True, + dest="filtering_technique", + help="'zfpkm', 'quantile', or 'cpm'", + ) + parser.add_argument( + "--cluster-algorithm", + type=str, + required=False, + default="umap", + dest="cluster_algorithm", + help="Clustering algorithm to use. 'mca' or 'umap'.", + ) + parser.add_argument( + "--label-plot", + type=str, + required=False, + default=True, + dest="label_plot", + help="Set to True to label replicate/batch/context names on the plots. May be ugly for large sets", + ) + parser.add_argument( + "--min-distance", + type=float, + required=False, + default=0.01, + dest="min_distance", + help="Minimum distance for UMAP clustering. 
Must be between 0 and 1", + ) + parser.add_argument( + "-r", + "--replicate-ratio", + type=str, + required=False, + default=0.9, + dest="replicate_ratio", + help="Ratio of genes active in replicates for a batch/study to be active", + ) + parser.add_argument( + "-b", + "--batch-ratio", + type=str or float, + required=False, + default=0.9, + dest="batch_ratio", + help="Ratio of genes active in a batch/study to be active in the context", + ) + parser.add_argument( + "--num-replicate-neighbors", + type=str or float, + required=False, + default="default", + dest="num_replicate_neighbors", + help="N nearest neighbors for replicate clustering, 'default' is total number of replicates", + ) + parser.add_argument( + "-nb", + "--num-batch-neighbors", + type=str or float, + required=False, + default="default", + dest="num_batch_neighbors", + help="N nearest neighbors for batch clustering, 'default' is total number of batches", + ) + parser.add_argument( + "--num-context-neighbors", + type=str or float, + required=False, + default="default", + dest="num_context_neighbors", + help="N nearest neighbors for context clustering, 'default' is total number of contexts", + ) + parser.add_argument( + "--min-active-count", + type=str or int, + required=False, + default="default", + dest="min_active_count", + help="Ratio of active genes in a batch/study to be active in the context", + ) + parser.add_argument( + "--quantile", + type=str or int, + required=False, + default=0.5, + dest="quantile", + help="Ratio of active genes in a batch/study to be active in the context", + ) + parser.add_argument( + "--seed", + type=int, + required=False, + default=-1, + dest="seed", + help="Random seed for clustering algorithm initialization", + ) + args = parser.parse_args() + args.context_names = stringlist_to_list(args.context_names) + return _Arguments(**vars(args)) diff --git a/main/como/cluster_sources.py b/main/como/cluster_sources.py new file mode 100644 index 00000000..c9e91dbd --- /dev/null +++ b/main/como/cluster_sources.py @@ -0,0 +1,22 @@ +from pathlib import Path + +import rpy2_api + +# read and translate R functions +r_file_path = Path("./rscripts/cluster_sources.R") +results_dir = "/Users/joshl/docker/madrid/local_files/results" +context_names = ["immNK", "naiveB"] +source_type = "zFPKM" +use_trna = True +use_mrna = True +binarize_data = True + +cluster_sources = rpy2_api.Rpy2( + r_file_path=r_file_path, + results_dir=results_dir, + context_names=context_names, + source_type=source_type, + use_trna=use_trna, + use_mrna=use_mrna, + binarize_data=binarize_data, +).call_function("cluster_sources_main") diff --git a/main/como/combine_distributions.py b/main/como/combine_distributions.py new file mode 100644 index 00000000..17349afe --- /dev/null +++ b/main/como/combine_distributions.py @@ -0,0 +1,284 @@ +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import pandas as pd +from loguru import logger + +from como.data_types import ( + BatchNames, + CombineOmicsInput, + GeneIdentifier, + InputMatrices, + OutputCombinedSourceFilepath, + SourceTypes, + SourceWeights, + _BatchEntry, +) + +# from como.plot.z_score import z_score_distribution +from como.utils import num_columns + + +def _combine_z_distribution_for_batch( + context_name: str, + batch: _BatchEntry, + matrix: pd.DataFrame, + source: SourceTypes, + output_combined_matrix_filepath: Path, + output_figure_dirpath: Path, + weighted_z_floor: int, + weighted_z_ceiling: int, +) -> pd.DataFrame: + """Combine z-score distributions 
across samples for a single batch. + + Args: + context_name: Name of the context (e.g., tissue or condition). + batch: Batch entry containing batch number and sample names. + matrix: DataFrame with 'ensembl_gene_id' and sample columns. + source: Source type (e.g., trna, mrna, scrna, proteomics). + output_combined_matrix_filepath: Path to save the combined z-score matrix. + output_figure_dirpath: Path to save the z-score distribution figure. + weighted_z_floor: Minimum z-score value after combining. + weighted_z_ceiling: Maximum z-score value after combining. + + Returns: + A pandas dataframe of the weighted z-distributions + """ + output_combined_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) + output_figure_dirpath.mkdir(parents=True, exist_ok=True) + + logger.trace( + f"Combining z-score distributions: batch #{batch.batch_num}, " + f"samples: {len(batch.sample_names)}, " + f"source: '{source.value}', " + f"context: '{context_name}'" + ) + if num_columns(matrix) < 2: + logger.trace(f"A single sample exists for batch '{batch.batch_num}'. Returning as-is because no additional combining can be done") + return matrix + + weighted_matrix = np.sum(matrix, axis=1) / np.sqrt(matrix.shape[1]) + weighted_matrix = np.clip(weighted_matrix, weighted_z_floor, weighted_z_ceiling).astype(np.int8) + + # merge_df = pd.concat([matrix, pd.Series(weighted_matrix, name="combined")], axis=1) + weighted_matrix = pd.DataFrame({"combine_z": weighted_matrix}, index=matrix.index) + + # stack_df = pd.melt( + # merge_df, + # id_vars=["ensembl_gene_id"], + # # Get all columns except gene identifier items (ensembl gene id, gene symbol, etc) + # value_vars=[col for col in merge_df.columns if col not in GeneIdentifier._member_map_], + # var_name="source", + # value_name="zscore", + # ) + # if len(stack_df["source"].unique()) > 10: + # stack_df = stack_df[stack_df["source"] == "combined"] + # z_score_distribution( + # stack_df, + # title=f"Combined Z-score Distribution for {context_name} - batch #{batch.batch_num}", + # output_filepath=output_figure_dirpath + # / f"{context_name}_{source.value}_batch{batch.batch_num}_combined_zscore_distribution.pdf", + # ) + + weighted_matrix.columns = [batch.batch_num] + weighted_matrix.to_csv(output_combined_matrix_filepath, index=False) + return weighted_matrix + + +def _combine_z_distribution_for_source( + merged_source_data: pd.DataFrame, + context_name: str, + num_replicates: int, + output_combined_matrix_filepath: Path, + output_figure_filepath: Path, + weighted_z_floor: int = -6, + weighted_z_ceiling: int = 6, +) -> pd.DataFrame: + """Combine z-score distributions across batches for a single source. + + Args: + merged_source_data: DataFrame with 'ensembl_gene_id' and batch columns. + context_name: Name of the context (e.g., tissue or condition). + num_replicates: Number of replicates (samples) for weighting. + output_combined_matrix_filepath: Path to save the combined z-score matrix. + output_figure_filepath: Path to save the z-score distribution figure. + weighted_z_floor: Minimum z-score value after combining. + weighted_z_ceiling: Maximum z-score value after combining. 
+ + Returns: + A pandas dataframe of the weighted z-distributions + """ + if num_columns(merged_source_data) <= 2: + logger.warning("A single source exists, returning matrix as-is because no additional combining can be done") + merged_source_data.columns = ["ensembl_gene_id", "combine_z"] + return merged_source_data + + output_combined_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) + output_figure_filepath.parent.mkdir(parents=True, exist_ok=True) + + logger.trace(f"Found {num_columns(merged_source_data) - 1} samples for context '{context_name}' to combine") + values = merged_source_data.iloc[:, 1:].values + mask = ~np.isnan(values) + masked_values = np.where(mask, values, 0) # Replace NaN with 0 + masked_num_replicates = np.where(mask, num_replicates, 0) + + weights = masked_num_replicates / np.sum(masked_num_replicates, axis=1, keepdims=True) + numerator = np.sum(weights * masked_values, axis=1) + denominator = np.sqrt(np.sum(weights**2, axis=1)) + weighted_matrix = numerator / denominator + weighted_matrix = np.clip(weighted_matrix, weighted_z_floor, weighted_z_ceiling) + logger.trace("Finished combining z-distribution") + # merge_df = pd.concat([merged_source_data, pd.Series(weighted_matrix, name="combined")], axis=1) + weighted_matrix = pd.DataFrame({"combine_z": weighted_matrix}, index=merged_source_data.index) + + # stack_df = pd.melt( + # merge_df, + # id_vars=["ensembl_gene_id"], + # value_vars=merge_df.columns[1:], # all other columns are values + # var_name="source", + # value_name="zscore", + # ) + # graph_zscore_distribution( + # df=stack_df, + # title=f"Combined Z-score Distribution for {context_name}", + # output_filepath=output_figure_filepath, + # ) + return weighted_matrix + + +def _combine_z_distribution_for_context( + context: str, + zscore_results: list[CombineOmicsInput], + output_graph_filepath: Path, + weighted_z_floor: int = -6, + weighted_z_ceiling: int = 6, +): + if not zscore_results: + logger.warning("No zscore results exist, returning empty dataframe") + return pd.DataFrame({"ensembl_gene_id": [], "combine_z": []}) + + z_matrices = [res.z_score_matrix.rename(columns=dict.fromkeys(res.z_score_matrix.columns, res.type.value)) for res in zscore_results] + z_matrix = pd.concat(z_matrices, axis=1, join="outer").reset_index() + if num_columns(z_matrix) <= 1: + logger.trace(f"Only 1 source exists for '{context}', returning dataframe as-is becuase no data exists to combine") + z_matrix.columns = ["ensembl_gene_id", "combine_z"] + return z_matrix + + values = z_matrix.iloc[:, 1:].values + weights = np.array([r.weight for r in zscore_results]) + mask = ~np.isnan(values) + masked_values = np.where(mask, values, 0) + masked_weights = np.where(mask, weights, 0) + + normalized_weights = masked_weights / np.sum(masked_weights, axis=1, keepdims=True) + numerator = np.sum(normalized_weights * masked_values, axis=1) + denominator = np.sqrt(np.sum(normalized_weights**2, axis=1)) + combined_z_matrix = numerator / denominator + combined_z_matrix = np.clip(combined_z_matrix, weighted_z_floor, weighted_z_ceiling) + combined_z_matrix_df = pd.DataFrame( + {"combine_z": combined_z_matrix}, + index=z_matrix.index, + ) + + stack_df = pd.melt( + z_matrix, + id_vars=["ensembl_gene_id"], + value_vars=z_matrix.columns[1:], + var_name="source", + value_name="zscore", + ) + combined_df = pd.DataFrame( + { + "ensembl_gene_id": z_matrix["ensembl_gene_id"], + "zscore": combined_z_matrix, + "source": "combined", + } + ) + stack_df = pd.concat([stack_df, combined_df]) + # 
graph_zscore_distribution( + # df=stack_df, + # title=f"Combined Z-score Distribution for {context}", + # output_filepath=output_graph_filepath, + # ) + return combined_z_matrix_df + + +def _begin_combining_distributions( + context_name: str, + input_matrices: InputMatrices, + batch_names: BatchNames, + source_weights: SourceWeights, + output_filepaths: OutputCombinedSourceFilepath, + output_figure_dirpath: Path, + output_final_model_scores: Path, + weighted_z_floor: int = -6, + weighted_z_ceiling: int = 6, +): + logger.info(f"Starting to combine z-scores for context '{context_name}'") + output_figure_dirpath.mkdir(parents=True, exist_ok=True) + + z_score_results: list[CombineOmicsInput] = [] + for source, matrix in input_matrices: + if matrix is None: + logger.trace(f"Source '{source.value}' is None, skipping") + continue + if source not in SourceTypes: + logger.critical(f"Invalid source; got '{source.value}', expected 'trna', 'mrna', 'scrna', or 'proteomics'.") + raise ValueError("Invalid source") + batch_results: list[pd.DataFrame] = [] + for batch in batch_names[source.value]: + send_df: pd.DataFrame = matrix[[GeneIdentifier.ENSEMBL_GENE_ID.value, *batch.sample_names]].copy() + send_df.set_index(GeneIdentifier.ENSEMBL_GENE_ID.value, drop=True, inplace=True) + + batch_results.append( + _combine_z_distribution_for_batch( + context_name=context_name, + batch=batch, + matrix=send_df, + source=source, + output_combined_matrix_filepath=( + output_filepaths[source.value].parent / f"{context_name}_{source.value}_batch{batch.batch_num}_combined_z_distribution.csv" + ), + output_figure_dirpath=output_figure_dirpath, + weighted_z_floor=weighted_z_floor, + weighted_z_ceiling=weighted_z_ceiling, + ) + ) + + merged_batch_results = pd.DataFrame() + for df in batch_results: + merged_batch_results = ( + df if merged_batch_results.empty else merged_batch_results.merge(df, left_index=True, right_index=True, how="outer") + ) + + merged_source_results: pd.DataFrame = _combine_z_distribution_for_source( + merged_source_data=merged_batch_results, + context_name=context_name, + num_replicates=sum(batch.num_samples for batch in batch_names[source.value]), + output_combined_matrix_filepath=( + output_filepaths[source.value].parent / f"{context_name}_{source.value}_combined_zscore_distribution.csv" + ), + output_figure_filepath=(output_figure_dirpath / f"{context_name}_{source.value}_combined_zscore_distribution.pdf"), + weighted_z_floor=weighted_z_floor, + weighted_z_ceiling=weighted_z_ceiling, + ) + z_score_results.append( + CombineOmicsInput( + z_score_matrix=merged_source_results, + type=source, + weight=source_weights[source.value], + ) + ) + merged_source_results.to_csv(output_filepaths[source.value], index=True) + logger.success(f"Wrote z-scores for source '{source.value}' in context '{context_name}' to '{output_filepaths[source.value]}'") + + logger.trace(f"Combining z-score distributions for all sources in context '{context_name}'") + merged_context_results = _combine_z_distribution_for_context( + context=context_name, + zscore_results=z_score_results, + output_graph_filepath=output_figure_dirpath / f"{context_name}_combined_omics_distribution.pdf", + ) + merged_context_results.to_csv(output_final_model_scores, index=True) + logger.success(f"Finished combining z-scores for context '{context_name}'") diff --git a/main/como/create_context_specific_model.py b/main/como/create_context_specific_model.py new file mode 100644 index 00000000..1d5b6863 --- /dev/null +++ 
b/main/como/create_context_specific_model.py @@ -0,0 +1,831 @@ +from __future__ import annotations + +import collections +import re +import sys +from collections.abc import Sequence +from io import TextIOWrapper +from pathlib import Path +from typing import Literal, TextIO, cast + +import cobra +import cobra.util.array +import numpy as np +import numpy.typing as npt +import pandas as pd +from cobra import Model +from cobra.flux_analysis import pfba +from loguru import logger +from troppo.methods.reconstruction.fastcore import FASTcore, FastcoreProperties +from troppo.methods.reconstruction.gimme import GIMME, GIMMEProperties +from troppo.methods.reconstruction.imat import IMAT, IMATProperties +from troppo.methods.reconstruction.tINIT import tINIT, tINITProperties + +from como.data_types import Algorithm, BoundaryReactions, BuildResults, CobraCompartments, LogLevel, Solver +from como.utils import _log_and_raise_error, read_file, set_up_logging, split_gene_expression_data + + +def _correct_bracket(rule: str, name: str) -> str: + """Correct GPR rules to format readable by. + + Args: + rule: GPR rule string from a COBRA model + name: Gene name string from a COBRA model + + Returns: + A corrected GPR rule string + """ + rule_match = re.search(r"or|and", rule) + name_match = re.search(r"or|and", name) + if rule_match is None or name_match is None: + left_rule = rule + left_name = name.strip() + right_rule = "" + right_name = "" + operator = "" + else: + left_rule = rule[: rule_match.span()[0]] + left_name = name[: name_match.span()[0]].strip() + right_rule = rule[rule_match.span()[1] :] + right_name = name[name_match.span()[1] :] + operator = rule_match.group() + + new_right_rule = [] + for char in list(left_rule): + if char.isspace() or char.isdigit(): + new_right_rule.append(char) + elif len(left_name) > 0 and char == left_name[0]: + new_right_rule.append(char) + left_name = left_name[1:] + new_left_rule = "".join(new_right_rule).strip() + final_right_rule = "" if rule_match is None else _correct_bracket(right_rule, right_name) + return " ".join([new_left_rule, operator, final_right_rule]).strip() + + +def _gene_rule_logical(gpr_expression: str, level: int = 0) -> str: + """Create an expression from GPR rule which can be evaluated as true or false. 
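+
+    For example, the rule "123 and 456" becomes "min{123, 456}" and "123 or 456" becomes
+    "max{123, 456}"; the caller later rewrites the braces as parentheses so the string can be
+    evaluated.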
+
+    Args:
+        gpr_expression: GPR rule string from a COBRA model
+        level: Current recursion level (used for debugging)
+
+    Returns:
+        An evaluable string where "and" is replaced with "min" and "or" is replaced with "max"
+    """
+    try:
+        loc_r = gpr_expression.index(")")
+    except ValueError:
+        if "and" in gpr_expression:
+            gpr_expression = gpr_expression.replace(" and ", ", ")
+            return "min{" + gpr_expression + "}"
+        elif "or" in gpr_expression:
+            gpr_expression = gpr_expression.replace(" or ", ", ")
+            return "max{" + gpr_expression + "}"
+        else:
+            gpr_expression = gpr_expression.replace("[", "")
+            return gpr_expression.replace("]", "")
+
+    loc_l = gpr_expression[:loc_r].rindex("(")
+    inner_string = gpr_expression[loc_l : loc_r + 1]
+    inner_string = inner_string.replace("(", "[")
+    inner_string = inner_string.replace(")", "]")
+    if "and" in inner_string:
+        inner_string = inner_string.replace("and", ",")
+        inner_string = "min{" + inner_string + "}"
+    elif "or" in inner_string:
+        inner_string = inner_string.replace("or", ",")
+        inner_string = "max{" + inner_string + "}"
+    else:
+        inner_string = inner_string.replace("[", "")
+        inner_string = inner_string.replace("]", "")
+
+    expression_out = f"{gpr_expression[:loc_l]}{inner_string}{gpr_expression[loc_r + 1 :]}"
+    expression_out = _gene_rule_logical(expression_out, level + 1)
+
+    return expression_out
+
+
+def _set_boundaries(
+    model: cobra.Model,
+    boundary_reactions: list[str],
+    lower_bounds: list[float],
+    upper_bounds: list[float],
+) -> cobra.Model:
+    # get boundary reactions
+    exchange_rxns = [rxn.id for rxn in model.reactions if "EX_" in rxn.id]
+    sink_rxns = [rxn.id for rxn in model.reactions if "sink_" in rxn.id]
+    demand_rxns = [rxn.id for rxn in model.reactions if "DM_" in rxn.id]
+
+    # Allows all boundary reactions to be used if none are given
+    allow_all_boundary_rxns = not boundary_reactions
+
+    # close sinks and demands not in boundary reactions unless no boundary reactions were given
+    if not allow_all_boundary_rxns:
+        for rxn in sink_rxns:  # set sinks to 0 unless they are listed as boundary reactions
+            getattr(model.reactions, rxn).lower_bound = lower_bounds[boundary_reactions.index(rxn)] if rxn in boundary_reactions else 0
+            getattr(model.reactions, rxn).upper_bound = upper_bounds[boundary_reactions.index(rxn)] if rxn in boundary_reactions else 1000
+
+        for rxn in demand_rxns:
+            getattr(model.reactions, rxn).lower_bound = 0
+            getattr(model.reactions, rxn).upper_bound = upper_bounds[boundary_reactions.index(rxn)] if rxn in boundary_reactions else 0
+
+    # Reaction media
+    medium = model.medium
+    for rxn in exchange_rxns:  # open exchanges from exchange file, close unspecified exchanges
+        medium[rxn] = -float(lower_bounds[boundary_reactions.index(rxn)]) if rxn in boundary_reactions else 0.0
+    model.medium = medium
+
+    return model
+
+
+def _feasibility_test(model_cobra: cobra.Model, step: str):
+    # check number of unsolvable reactions for reference model under media assumptions
+    # create flux consistent model (removes some reactions)
+    model_cobra_rm = cobra.flux_analysis.fastcc(model_cobra, flux_threshold=15, zero_cutoff=1e-7)
+    incon_rxns = set(model_cobra.reactions.list_attr("id")) - set(model_cobra_rm.reactions.list_attr("id"))
+    incon_rxns_cnt = len(incon_rxns)
+
+    if step == "before_seeding":
+        logger.warning(
+            f"Under given boundary assumptions, there are {incon_rxns_cnt} infeasible reactions in"
+            f" the reference model. These reactions will not be considered active in "
+            f"context specific model construction. 
If any infeasible reactions are found to be "
+            f"active according to expression data, or are found in the force reactions list, "
+            f"they can be found in 'InfeasibleRxns.csv'. It is normal for this value to be quite large; "
+            f"however, if many of these reactions are active according to your expression data, "
+            f"it is likely that you are missing some critical exchange (media) reactions."
+        )
+    elif step == "after_seeding":
+        logger.warning(
+            f"Under given boundary assumptions, with infeasible reactions from the general model not "
+            f"considered, there are {incon_rxns_cnt} new infeasible reactions in the context-specific model. "
+            f"These reactions will be removed from the output model to ensure the model is solvable. "
+            f"Note that this value should be very low compared to the reference model."
+        )
+
+    return incon_rxns, model_cobra_rm
+
+
+def _build_with_gimme(
+    reference_model: cobra.Model,
+    lower_bounds: Sequence[float | np.floating],
+    upper_bounds: Sequence[float | np.floating],
+    idx_objective: int,
+    expr_vector: npt.NDArray[np.floating],
+):
+    model_reconstruction = reference_model.copy()
+    s_matrix: npt.NDArray[np.floating] = cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction)
+    # `Becker and Palsson (2008). Context-specific metabolic networks are
+    # consistent with experiments. PLoS Comput. Biol. 4, e1000082.`
+    properties = GIMMEProperties(
+        exp_vector=expr_vector,  # np.array(gimme_data['0']),
+        obj_frac=0.9,
+        objectives=[{idx_objective: 1}],
+        preprocess=True,
+        flux_threshold=0.9,
+    )
+    algorithm = GIMME(s_matrix, lower_bounds, upper_bounds, properties)
+    gene_activity = algorithm.run()
+    reaction_ids = [r.id for r in model_reconstruction.reactions]
+    to_remove_ids = [reaction_ids[r] for r in np.where(gene_activity == 0)[0]]
+
+    model_reconstruction.remove_reactions(to_remove_ids, True)
+    psol = pfba(model_reconstruction)  # noqa: F841
+    # reaction_ids = [r.id for r in context_cobra_model.reactions]
+    # psol = context_cobra_model.optimize()
+    # to_remove_ids = [reaction_ids[r] for r in np.where(abs(psol.fluxes) < 1e-8)[0]]
+    # context_cobra_model.remove_reactions(to_remove_ids, True)
+
+    return model_reconstruction
+
+
+def _build_with_fastcore(cobra_model, s_matrix, lower_bounds, upper_bounds, exp_idx_list, solver):
+    # 'Vlassis, Pacheco, Sauter (2014). Fast reconstruction of compact
+    # context-specific metabolic network models. PLoS Comput. Biol. 10,
+    # e1003424.'
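+    # `exp_idx_list` holds the indices of reactions treated as "core" (expression value above zero);
+    # FASTcore searches for a compact, flux-consistent subnetwork that retains every core reaction.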
+
+    logger.warning("Fastcore requires that a flux-consistent model is used as the reference; to achieve this, fastcc is required, which is NOT reproducible.")
+    logger.debug("Creating feasible model")
+    _, cobra_model = _feasibility_test(cobra_model, "other")
+    properties = FastcoreProperties(core=exp_idx_list, solver=solver)
+    algorithm = FASTcore(s_matrix, lower_bounds, upper_bounds, properties)
+    context_rxns = algorithm.fastcore()
+    context_cobra_model = cobra_model.copy()
+    r_ids = [r.id for r in context_cobra_model.reactions]
+    remove_rxns = [r_ids[int(i)] for i in range(s_matrix.shape[1]) if i not in context_rxns]
+    context_cobra_model.remove_reactions(remove_rxns, True)
+
+    return context_cobra_model
+
+
+def _build_with_imat(
+    reference_model: cobra.Model,
+    lower_bounds: Sequence[float],
+    upper_bounds: Sequence[float],
+    expr_vector: npt.NDArray,
+    expr_thresh: tuple[float, float],
+    force_gene_indices: Sequence[int],
+    solver: str,
+) -> cobra.Model:
+    properties: IMATProperties = IMATProperties(
+        exp_vector=expr_vector,
+        exp_thresholds=expr_thresh,
+        core=force_gene_indices,
+        epsilon=0.01,
+        solver=solver.upper(),
+    )
+
+    # Creating a copy of the model ensures we don't make any in-place modifications by accident
+    # Using cobra to create the stoichiometry matrix means we have less work to do
+    force_gene_indices = np.array(force_gene_indices, dtype=np.uint16)
+    model_reconstruction: cobra.Model = reference_model.copy()
+    s_matrix: npt.NDArray[np.floating] = cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction)
+    algorithm: IMAT = IMAT(S=s_matrix, lb=np.array(lower_bounds), ub=np.array(upper_bounds), properties=properties)
+    rxns_from_imat: npt.NDArray[np.uint16] = algorithm.run().astype(np.uint16)
+
+    # Collect all reaction IDs and their associated index (e.g., HEX1 is at index 123)
+    all_rxn_ids: npt.NDArray[str] = np.array([r.id for r in model_reconstruction.reactions], dtype=object)
+    all_rxn_indices: npt.NDArray[np.uint16] = np.array(range(len(model_reconstruction.reactions)), dtype=np.uint16)
+
+    # Collect reactions to keep by creating a unique set of reactions from the iMAT algorithm and force-include reactions
+    # dtype is set to uint16 because indices will not be below 0 or be greater than 65,535 (max size of uint16),
+    # because only ~10,000 reactions exist in Recon3D
+    # Unsafe casting is OK because of these facts.
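+    # Illustrative example (hypothetical values): rxns_from_imat = [0, 2, 5] and force_gene_indices = [2, 7]
+    # would give rxn_indices_to_keep = [0, 2, 5, 7]; every reaction index missing from that array is removed below.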
+ rxn_indices_to_keep: npt.NDArray[np.uint16] = np.unique(np.concatenate([rxns_from_imat, force_gene_indices], dtype=np.uint16)) + + # Reaction indices to exclude from the model are thus reactions that are not marked to be included in the model + # Assume unique is false because every value that is in `rxn_indices_to_keep` is included in `all_rxn_indices` + rxn_indices_to_remove: npt.NDArray[np.uint16] = np.setdiff1d(all_rxn_indices, rxn_indices_to_keep, assume_unique=False) + model_reconstruction.remove_reactions(reactions=all_rxn_ids[rxn_indices_to_remove].tolist(), remove_orphans=True) + + return model_reconstruction + + +def _build_with_tinit( + reference_model: cobra.Model, + lower_bounds, + upper_bounds, + expr_vector, + solver, + idx_force, +) -> Model: + properties = tINITProperties( + reactions_scores=expr_vector, + solver=solver, + essential_reactions=idx_force, + production_weight=0.0, + allow_excretion=False, + no_reverse_loops=True, + ) + model_reconstruction = reference_model.copy() + s_matrix: npt.NDArray[np.floating] = cobra.util.array.create_stoichiometric_matrix(model=model_reconstruction).astype(np.float64) + algorithm = tINIT(s_matrix, lower_bounds, upper_bounds, properties) + algorithm.preprocessing() + algorithm.build_problem() + _log_and_raise_error("tINIT is not yet implemented.", error=NotImplementedError, level=LogLevel.CRITICAL) + + +async def _map_expression_to_reaction( + reference_model, + gene_expression_file, + recon_algorithm: Algorithm, + low_thresh: float, + high_thresh: float, +) -> collections.OrderedDict[str, int]: + """Map gene ids to a reaction based on GPR (gene to protein to reaction) association rules. + + These rules should be defined in the general genome-scale metabolic model + + Args: + reference_model: A COBRA model object representing the general genome-scale metabolic model. + gene_expression_file: Path to a gene expression file (.csv, .tsv, .xlsx, or .xls) + recon_algorithm: Algorithm to use for reconstruction (GIMME, FASTCORE, iMAT, or tINIT) + low_thresh: Low expression threshold for algorithms that require it (iMAT, tINIT) + high_thresh: High expression threshold for algorithms that require it (iMAT, tINIT) + + Returns: + An ordered dictionary mapping reaction IDs to their corresponding expression values. + + Raises: + ValueError: If neither 'entrez_gene_id' nor 'ensembl_gene_id' columns are found in the gene expression file. 
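+
+    Note:
+        Gene IDs in each GPR rule are replaced with their activity values (or an algorithm-specific
+        default when a gene is absent from the expression data), and the boolean operators are then
+        reduced with min ("and") and max ("or") before the rule is evaluated.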
+ """ + expression_data = await read_file(gene_expression_file) + identifier_column = next((col for col in ("entrez_gene_id", "ensembl_gene_id") if col in expression_data.columns), "") + + if not identifier_column: + raise ValueError( + f"At least one column of 'entrez_gene_id' or 'ensembl_gene_id' could not be found in the gene expression file '{gene_expression_file}'" + ) + gene_activity = split_gene_expression_data( + expression_data, + identifier_column=cast(Literal["ensembl_gene_id", "entrez_gene_id"], identifier_column), + recon_algorithm=recon_algorithm, + ) + reaction_expression = collections.OrderedDict() + + # fmt: off + # Define a default expression value if a gene ID is not found + default_expression = ( + np.mean([low_thresh, high_thresh]) if recon_algorithm in {Algorithm.IMAT, Algorithm.TINIT} + else -1 if recon_algorithm == Algorithm.GIMME + else 0 if recon_algorithm == Algorithm.FASTCORE + else 1 + ) + # fmt: on + + error_count = 0 + for rxn in reference_model.reactions: + rxn: cobra.Reaction + + gene_reaction_rule = rxn.gene_reaction_rule + if not gene_reaction_rule: + continue + + gene_ids = set(re.findall(r"\d+", gene_reaction_rule)) + reaction_expression[rxn.id] = default_expression + for gene_id in gene_ids: + activity = gene_activity.at[gene_id, "active"] if gene_id in gene_activity.index else f"{default_expression!s}" + # replace gene_id with activity, using optional whitespace before and after the gene id + # Do not replace the whitespace (if it exists) before and after the gene ID + gene_reaction_rule = re.sub(pattern=rf"\b{gene_id}\b", repl=activity, string=gene_reaction_rule) + + try: + # We are using eval here because ast.literal_eval is unable to process an evaluable such as `max(-4, 0, 1)` + # This isn't ideal, but ultimately the only other option is writing and maintaining a custom parsing engine, which is too much work + evaluable_gene_rule = _gene_rule_logical(gene_reaction_rule).replace("{", "(").replace("}", ")") + reaction_expression[rxn.id] = eval(evaluable_gene_rule) # noqa: S307 + except ValueError: + error_count += 1 + + logger.debug(f"Mapped gene expression to reactions, found {error_count} error(s).") + # expr_vector = np.array(list(reaction_expression.values()), dtype=float) + + return reaction_expression + + +async def _build_model( # noqa: C901 + general_model_file: Path, + gene_expression_file: Path, + recon_algorithm: Algorithm, + objective: str, + boundary_reactions: list[str], + exclude_reactions: list[str], + force_reactions: list[str], + lower_bounds: list[float], + upper_bounds: list[float], + solver: str, + low_thresh: float, + high_thresh: float, + output_flux_result_filepath: Path, + *, + force_boundary_rxn_inclusion: bool, +) -> BuildResults: + """Seed a context specific reference_model. + + Core reactions are determined from GPR associations with gene expression logicals. + Core reactions that do not necessarily meet GPR association requirements can be forced if in the force reaction + file. Metabolite exchange (media), sinks, and demands are determined from exchanges file. Reactions can also be + force excluded even if they meet GPR association requirements using the force exclude file. 
+
+    Args:
+        general_model_file: Path to a COBRA model file (.xml, .mat, or .json)
+        gene_expression_file: Path to a gene expression file (.csv, .tsv, .xlsx, or .xls)
+        recon_algorithm: Algorithm to use for reconstruction (GIMME, FASTCORE, iMAT, or tINIT)
+        objective: Objective reaction ID in the general model
+        boundary_reactions: List of boundary reactions to set in the model
+        exclude_reactions: List of reactions to exclude from the model
+        force_reactions: List of reactions to force include in the model
+        lower_bounds: List of lower bounds corresponding to boundary reactions
+        upper_bounds: List of upper bounds corresponding to boundary reactions
+        solver: Solver to use (e.g., 'glpk', 'cplex', 'gurobi')
+        low_thresh: Low expression threshold for algorithms that require it (iMAT, tINIT)
+        high_thresh: High expression threshold for algorithms that require it (iMAT, tINIT)
+        output_flux_result_filepath: Path to save flux results (for iMAT only)
+        force_boundary_rxn_inclusion: If True, ensure that all boundary reactions are included in the final model.
+
+    Returns:
+        A BuildResults object containing the context-specific model, list of expression indices used, and a DataFrame of infeasible reactions.
+    """
+    reference_model: cobra.Model
+    match general_model_file.suffix:
+        case ".mat":
+            reference_model = cobra.io.load_matlab_model(general_model_file)
+        case ".xml" | ".sbml":
+            reference_model = cobra.io.read_sbml_model(general_model_file)
+        case ".json":
+            reference_model = cobra.io.load_json_model(general_model_file)
+        case _:
+            _log_and_raise_error(
+                f"Reference model format must be .xml, .sbml, .mat, or .json; found '{general_model_file.suffix}'",
+                error=ValueError,
+                level=LogLevel.ERROR,
+            )
+
+    if objective not in force_reactions:
+        force_reactions.append(objective)
+    reference_model = _set_boundaries(reference_model, boundary_reactions, lower_bounds, upper_bounds)
+    reference_model.solver = solver.lower()
+
+    # check number of unsolvable reactions for reference model under media assumptions
+    # inconsistent_reactions, cobra_model = _feasibility_test(cobra_model, "before_seeding")
+    inconsistent_reactions = []
+    s_matrix = cobra.util.array.create_stoichiometric_matrix(reference_model, array_type="dense")
+    lower_bounds = []
+    upper_bounds = []
+    reaction_ids = []
+    for reaction in reference_model.reactions:
+        lower_bounds.append(reaction.lower_bound)
+        upper_bounds.append(reaction.upper_bound)
+        reaction_ids.append(reaction.id)
+
+    # get expressed reactions
+    reaction_expression: collections.OrderedDict[str, int] = await _map_expression_to_reaction(
+        reference_model,
+        gene_expression_file,
+        recon_algorithm,
+        high_thresh=high_thresh,
+        low_thresh=low_thresh,
+    )
+    expression_vector: npt.NDArray[np.int32] = np.array(list(reaction_expression.values()), dtype=np.int32)
+
+    for rxn in force_reactions:
+        if rxn not in reaction_ids:
+            logger.warning(
+                f"The force reaction '{rxn}' was not found in the reference model. "
+                f"Check BiGG, or the relevant database for your reference model, for synonyms."
+ ) + + # collect list of reactions that are infeasible but active in expression data or user defined + infeasible_expression_reactions = [] + infeasible_force_reactions = [] + + for i, rxn in enumerate(reaction_expression): + # log reactions in expressed and force lists that are infeasible that the user may wish to review + if rxn in inconsistent_reactions and expression_vector[i] == 1: + infeasible_expression_reactions.append(rxn) + if rxn in inconsistent_reactions and rxn in force_reactions: + infeasible_force_reactions.append(rxn) + + if rxn in force_reactions: + expression_vector[i] = high_thresh + 0.1 if recon_algorithm in {Algorithm.TINIT, Algorithm.IMAT} else 1 + if rxn in inconsistent_reactions or rxn in exclude_reactions: + expression_vector[i] = low_thresh - 0.1 if recon_algorithm in {Algorithm.TINIT, Algorithm.IMAT} else 0 + + objective_index = reaction_ids.index(objective) + + if force_boundary_rxn_inclusion: + all_forced: set[str] = {*force_reactions, *boundary_reactions} + force_reaction_indices: npt.NDArray[np.uint16] = np.array( + [reaction_ids.index(rxn) for rxn in all_forced if rxn in reaction_ids], dtype=np.uint16 + ) + else: + force_reaction_indices: npt.NDArray[np.uint16] = np.array( + [reaction_ids.index(rxn) for rxn in force_reactions if rxn in reaction_ids], dtype=np.uint16 + ) + + expression_vector_indices = [i for (i, val) in enumerate(expression_vector) if val > 0] + expression_threshold = (low_thresh, high_thresh) + + match recon_algorithm: + case Algorithm.GIMME: + context_model_cobra: cobra.Model = _build_with_gimme( + reference_model=reference_model, + lower_bounds=lower_bounds, + upper_bounds=upper_bounds, + idx_objective=objective_index, + expr_vector=expression_vector, + ) + case Algorithm.FASTCORE: + context_model_cobra: cobra.Model = _build_with_fastcore( + cobra_model=reference_model, + s_matrix=s_matrix, + lower_bounds=lower_bounds, + upper_bounds=upper_bounds, + exp_idx_list=expression_vector_indices, + solver=solver, + ) + case Algorithm.IMAT: + context_model_cobra: cobra.Model = _build_with_imat( + reference_model=reference_model, + lower_bounds=lower_bounds, + upper_bounds=upper_bounds, + expr_vector=expression_vector, + expr_thresh=expression_threshold, + force_gene_indices=force_reaction_indices, + solver=solver, + ) + context_model_cobra.objective = objective + flux_sol: cobra.Solution = context_model_cobra.optimize() + fluxes: pd.Series = flux_sol.fluxes + model_reactions: list[str] = [reaction.id for reaction in context_model_cobra.reactions] + reaction_intersections: set[str] = set(fluxes.index).intersection(model_reactions) + flux_df: pd.DataFrame = fluxes[~fluxes.index.isin(reaction_intersections)] + flux_df.dropna(inplace=True) + flux_df.to_csv(output_flux_result_filepath) + case Algorithm.TINIT: + context_model_cobra: cobra.Model = _build_with_tinit( + reference_model=reference_model, + lower_bounds=lower_bounds, + upper_bounds=upper_bounds, + expr_vector=expression_vector, + solver=solver, + idx_force=force_reaction_indices, + ) + case _: + _log_and_raise_error( + ( + f"Reconstruction algorithm must be {Algorithm.GIMME.value}, " + f"{Algorithm.FASTCORE.value}, {Algorithm.IMAT.value}, or {Algorithm.TINIT.value}. 
" + f"Got: {recon_algorithm.value}" + ), + error=ValueError, + level=LogLevel.ERROR, + ) + + inconsistent_and_infeasible_reactions: pd.DataFrame = pd.concat( + [ + pd.DataFrame({"infeasible_reactions": inconsistent_reactions}), + pd.DataFrame({"expressed_infeasible_reactions": infeasible_expression_reactions}), + pd.DataFrame({"infeasible_force_reactions": infeasible_force_reactions}), + pd.DataFrame({"infeasible_context_reactions": []}), # Included to maintain legacy support + ], + ignore_index=True, + axis=0, + ) + + return BuildResults( + model=context_model_cobra, + expression_index_list=expression_vector_indices, + infeasible_reactions=inconsistent_and_infeasible_reactions, + ) + + +async def _create_df(path: Path, *, lowercase_col_names: bool = False) -> pd.DataFrame: + if path.suffix not in {".csv", ".tsv"}: + raise ValueError(f"File must be a .csv or .tsv file, got '{path.suffix}'") + df: pd.DataFrame = await read_file(path=path, header=0, sep="," if path.suffix == ".csv" else "\t", h5ad_as_df=True) + + if not isinstance(df, pd.DataFrame): + _log_and_raise_error( + f"Expected a pandas.DataFrame, got {type(df)}", + error=TypeError, + level=LogLevel.ERROR, + ) + + if lowercase_col_names: + df.columns = [c.lower() for c in df.columns] + return df + + +async def _collect_boundary_reactions(path: Path) -> BoundaryReactions: + df: pd.DataFrame = await _create_df(path, lowercase_col_names=True) + for column in df.columns: + if column not in [ + "reaction", + "abbreviation", + "compartment", + "minimum reaction rate", + "maximum reaction rate", + ]: + _log_and_raise_error( + ( + f"Boundary reactions file must have columns named 'Reaction', 'Abbreviation', 'Compartment', " + f"'Minimum Reaction Rate', and 'Maximum Reaction Rate'. Found: {column}" + ), + error=ValueError, + level=LogLevel.ERROR, + ) + + reactions: list[str] = [""] * len(df) + boundary_type: list[str] = df["reaction"].tolist() + reaction_abbreviation: list[str] = list(df["abbreviation"].astype(str)) + reaction_compartment: list[str] = list(df["compartment"].astype(str)) + boundary_map = {"exchange": "EX", "demand": "DM", "sink": "SK"} + for i in range(len(boundary_type)): + boundary: str = boundary_type[i].lower() + if boundary not in boundary_map: + _log_and_raise_error( + f"Boundary reaction type must be 'Exchange', 'Demand', or 'Sink'. Found: {boundary}", + error=ValueError, + level=LogLevel.ERROR, + ) + + shorthand_compartment = CobraCompartments.get_shorthand(reaction_compartment[i]) + reactions[i] = f"{boundary_map.get(boundary)}_{reaction_abbreviation[i]}[{shorthand_compartment}]" + + return BoundaryReactions( + reactions=reactions, + lower_bounds=df["minimum reaction rate"].tolist(), + upper_bounds=df["maximum reaction rate"].tolist(), + ) + + +def _write_model_to_disk(context_name: str, model: cobra.Model, output_filepaths: list[Path]) -> None: + for path in output_filepaths: + path.parent.mkdir(parents=True, exist_ok=True) + if path.suffix == ".mat": + cobra.io.save_matlab_model(model=model, file_name=path) + elif path.suffix == ".json": + cobra.io.save_json_model(model=model, filename=path, pretty=True) + elif path.suffix in {".sbml", ".xml"}: + cobra.io.write_sbml_model(cobra_model=model, filename=path) + else: + _log_and_raise_error( + f"Invalid output model filetype. Should be one of .xml, .sbml, .mat, or .json. 
Got '{path.suffix}'", + error=ValueError, + level=LogLevel.ERROR, + ) + logger.success(f"Saved metabolic model for context '{context_name}' to '{path}'") + + +async def create_context_specific_model( # noqa: C901 + context_name: str, + reference_model: Path, + active_genes_filepath: Path, + output_infeasible_reactions_filepath: Path, + output_flux_result_filepath: Path, + output_model_filepaths: Path | list[Path], + output_filetypes: list[str] | None = None, + output_fastcore_expression_index_filepath: Path | None = None, + objective: str = "biomass_reaction", + boundary_rxns_filepath: str | Path | None = None, + exclude_rxns_filepath: str | Path | None = None, + force_rxns_filepath: str | Path | None = None, + algorithm: Algorithm = Algorithm.GIMME, + low_threshold: float = -5, + high_threshold: float = -3, + solver: Solver = Solver.GLPK, + log_level: LogLevel = LogLevel.INFO, + log_location: str | TextIO | TextIOWrapper = sys.stderr, + *, + force_boundary_rxn_inclusion: bool = True, +): + """Create a context-specific model using the provided data. + + Args: + context_name: Name of the context-specific model. + reference_model: Path to the general genome-scale metabolic model file (.xml, .mat, or .json). + active_genes_filepath: Path to the gene expression data file (csv, tsv, or Excel). + output_infeasible_reactions_filepath: Path to save infeasible reactions (csv). + output_flux_result_filepath: Path to save flux results (csv). + output_model_filepaths: Path or list of paths to save the context-specific model (.xml, .mat, or .json). + output_filetypes: List of file types to save the model as ('xml', 'mat', 'json'). + output_fastcore_expression_index_filepath: Path to save Fastcore expression indices (txt). Required if using Fastcore. + objective: Objective function reaction ID. + boundary_rxns_filepath: Optional path to boundary reactions file (csv, tsv, or Excel). + exclude_rxns_filepath: Optional path to reactions to exclude file (csv, tsv, or Excel). + force_rxns_filepath: Optional path to reactions to force include file (csv, tsv, or Excel). + algorithm: Algorithm to use for reconstruction. One of Algorithm.GIMME, Algorithm.FASTCORE, Algorithm.IMAT, Algorithm.TINIT. + low_threshold: Low expression threshold for algorithms that require it. + high_threshold: High expression threshold for algorithms that require it. + solver: Solver to use. One of Solver.GLPK, Solver.CPLEX, Solver.GUROBI + log_level: Logging level. One of LogLevel.DEBUG, LogLevel.INFO, LogLevel.WARNING, LogLevel.ERROR, LogLevel.CRITICAL + log_location: Location for log output. Can be a file path or sys.stderr/sys.stdout. + force_boundary_rxn_inclusion: If True, ensure that all provided boundary reactions are included in the final model. + + Raises: + ImportError: If Gurobi solver is selected but gurobipy is not installed. + """ + boundary_rxns_filepath: Path | None = Path(boundary_rxns_filepath) if boundary_rxns_filepath else None + set_up_logging(level=log_level, location=log_location) + output_model_filepaths = [output_model_filepaths] if isinstance(output_model_filepaths, Path) else output_model_filepaths + for path in output_model_filepaths: + if path.suffix not in {".mat", ".xml", ".sbml", ".json"}: + _log_and_raise_error( + f"Invalid output model filetype. Should be one of .xml, .sbml, .mat, or .json. 
Got '{path.suffix}'", + error=ValueError, + level=LogLevel.ERROR, + ) + if len(output_model_filepaths) != len(output_model_filepaths): + _log_and_raise_error( + "The number of output model filepaths must be the same as the number of output flux result filepaths", + error=ValueError, + level=LogLevel.ERROR, + ) + + if not reference_model.exists(): + _log_and_raise_error( + f"Reference model not found at {reference_model}", + error=FileNotFoundError, + level=LogLevel.ERROR, + ) + if not active_genes_filepath.exists(): + _log_and_raise_error( + f"Active genes file not found at {active_genes_filepath}", + error=FileNotFoundError, + level=LogLevel.ERROR, + ) + if algorithm == Algorithm.FASTCORE and not output_fastcore_expression_index_filepath: + _log_and_raise_error( + "The fastcore expression index output filepath must be provided", + error=ValueError, + level=LogLevel.ERROR, + ) + if boundary_rxns_filepath and not boundary_rxns_filepath.exists(): + _log_and_raise_error( + f"Boundary reactions file not found at {boundary_rxns_filepath}", + error=FileNotFoundError, + level=LogLevel.ERROR, + ) + + output_filetypes = ["mat"] if output_filetypes is None else output_filetypes + for output_type in output_filetypes: + if output_type not in {"xml", "mat", "json"}: + _log_and_raise_error( + f"Output file type {output_type} not recognized. Must be one of: 'xml', 'mat', 'json'", + error=ValueError, + level=LogLevel.ERROR, + ) + + if algorithm not in Algorithm: + _log_and_raise_error( + f"Algorithm {algorithm} not supported. Use one of {', '.join(a.value for a in Algorithm)}", + error=ValueError, + level=LogLevel.ERROR, + ) + + if solver not in Solver: + _log_and_raise_error( + f"Solver '{solver}' not supported. Use one of {', '.join(s.value for s in Solver)}", + error=ValueError, + level=LogLevel.ERROR, + ) + + if boundary_rxns_filepath: + boundary_reactions = await _collect_boundary_reactions(boundary_rxns_filepath) + + exclude_rxns: list[str] = [] + if exclude_rxns_filepath: + exclude_rxns_filepath: Path = Path(exclude_rxns_filepath) + df = await _create_df(exclude_rxns_filepath) + if "abbreviation" not in df.columns: + _log_and_raise_error( + "The exclude reactions file should have a single column with a header named Abbreviation", + error=ValueError, + level=LogLevel.ERROR, + ) + exclude_rxns = df["abbreviation"].tolist() + + force_rxns: list[str] = [] + if force_rxns_filepath: + force_rxns_filepath: Path = Path(force_rxns_filepath) + df = await _create_df(force_rxns_filepath, lowercase_col_names=True) + if "abbreviation" not in df.columns: + _log_and_raise_error( + "The force reactions file should have a single column with a header named Abbreviation", + error=ValueError, + level=LogLevel.ERROR, + ) + force_rxns = df["abbreviation"].tolist() + + # Test that gurobi is using a valid license file + if solver == Solver.GUROBI: + # test if gurobi is available + try: + import gurobipy as gp + except ImportError as e: + logger.error( + "The gurobi solver requires the gurobipy package to be installed. " + "Please install gurobipy and try again. " + "This can be done by installing the 'gurobi' optional dependency." + ) + raise ImportError from e + + env = gp.Env() + if env.getParam("WLSACCESSID") == "" or env.getParam("WLSSECRET") == "": + logger.critical( + "Gurobi solver requested, but license information cannot be found. " + "COMO will continue, but it is HIGHLY unlikely the resulting model will be valid." 
+ ) + # remove gurobi-related information, it is no longer required + del env, gp + + logger.info(f"Creating '{context_name}' model using '{algorithm.value}' reconstruction and '{solver.value}' solver") + build_results: BuildResults = await _build_model( + general_model_file=reference_model, + gene_expression_file=active_genes_filepath, + recon_algorithm=algorithm, + objective=objective, + boundary_reactions=boundary_reactions.reactions, + exclude_reactions=exclude_rxns, + force_reactions=force_rxns, + lower_bounds=boundary_reactions.lower_bounds, + upper_bounds=boundary_reactions.upper_bounds, + solver=solver.value.lower(), + low_thresh=low_threshold, + high_thresh=high_threshold, + output_flux_result_filepath=output_flux_result_filepath, + force_boundary_rxn_inclusion=force_boundary_rxn_inclusion, + ) + + build_results.infeasible_reactions.dropna(inplace=True) + build_results.infeasible_reactions.to_csv(output_infeasible_reactions_filepath, index=False) + + if algorithm == Algorithm.FASTCORE: + fastcore_df = pd.DataFrame(build_results.expression_index_list) + fastcore_df.dropna(inplace=True) + fastcore_df.to_csv(output_fastcore_expression_index_filepath, index=False) + + _write_model_to_disk(context_name=context_name, model=build_results.model, output_filepaths=output_model_filepaths) + logger.debug(f"Number of Genes: {len(build_results.model.genes):,}") + logger.debug(f"Number of Metabolites: {len(build_results.model.metabolites):,}") + logger.debug(f"Number of Reactions: {len(build_results.model.reactions):,}") diff --git a/main/como/data_types.py b/main/como/data_types.py new file mode 100644 index 00000000..fc77e0e7 --- /dev/null +++ b/main/como/data_types.py @@ -0,0 +1,274 @@ +from __future__ import annotations + +from collections.abc import Iterator +from dataclasses import dataclass, field, fields +from enum import Enum +from pathlib import Path +from typing import ClassVar, NamedTuple + +import cobra +import pandas as pd +from loguru import logger + +PATH_TYPE = str | Path +LOG_FORMAT = "{time:YYYY-MM-DD HH:mm:ss} | {level:<8} | {name}:{line} - {message}" + + +class AdjustmentMethod(Enum): + """Adjustment method for expression requirement based on differences in number of provided data source types.""" + + PROGRESSIVE = "progressive" + REGRESSIVE = "regressive" + FLAT = "flat" + CUSTOM = "custom" + + +class Algorithm(Enum): + GIMME = "GIMME" + FASTCORE = "FASTCORE" + IMAT = "IMAT" + TINIT = "TINIT" + + +class FilteringTechnique(Enum): + """RNA sequencing filtering capabilities.""" + + CPM = "cpm" + ZFPKM = "zfpkm" + TPM = "tpm" + UMI = "umi" + + +class GeneIdentifier(Enum): + ENSEMBL_GENE_ID = "ensembl_gene_id" + ENTREZ_GENE_ID = "entrez_gene_id" + GENE_SYMBOL = "gene_symbol" + + +class LogLevel(Enum): + TRACE = 5 + DEBUG = 10 + INFO = 20 + SUCCESS = 25 + WARNING = 30 + ERROR = 40 + CRITICAL = 50 + NONE = 100 + + +class RNAType(Enum): + TRNA = "total" + MRNA = "mrna" + SCRNA = "scrna" + + +class Solver(Enum): + """Solver used to seed context specific model.""" + + GLPK = "GLPK" + GUROBI = "GUROBI" + SCIPY = "SCIPY" + GLPK_EXACT = "GLPK_EXACT" + + +class SourceTypes(Enum): + TRNA = "trna" + MRNA = "mrna" + SCRNA = "scrna" + PROTEOMICS = "proteomics" + + +class PeakIdentificationParameters(NamedTuple): + height: float + distance: float + + +class CobraCompartments: + """Convert from compartment "long-hand" to "short-hand". 
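+
+    For example, `CobraCompartments.get_shorthand("cytosol")` returns "c" and
+    `CobraCompartments.get_longhand("e")` returns "extracellular".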
+ + Shorthand from: https://cobrapy.readthedocs.io/en/latest/_modules/cobra/medium/annotations.html + + "Extracellular" -> "e" + "golgi" -> "g" + """ + + SHORTHAND: ClassVar[dict[str, list[str]]] = { + "ce": ["cell envelope"], + "c": [ + "cytoplasm", + "cytosol", + "default", + "in", + "intra cellular", + "intracellular", + "intracellular region", + "intracellular space", + ], + "er": ["endoplasmic reticulum"], + "erm": ["endoplasmic reticulum membrane"], + "e": [ + "extracellular", + "extraorganism", + "out", + "extracellular space", + "extra organism", + "extra cellular", + "extra-organism", + "external", + "external medium", + ], + "f": ["flagellum", "bacterial-type flagellum"], + "g": ["golgi", "golgi apparatus"], + "gm": ["golgi membrane"], + "h": ["chloroplast"], + "l": ["lysosome"], + "im": ["mitochondrial intermembrane space"], + "mm": ["mitochondrial membrane"], + "m": ["mitochondrion", "mitochondria"], + "n": ["nucleus"], + "p": ["periplasm", "periplasmic space"], + "x": ["peroxisome", "glyoxysome"], + "u": ["thylakoid"], + "vm": ["vacuolar membrane"], + "v": ["vacuole"], + "w": ["cell wall"], + "s": ["eyespot", "eyespot apparatus", "stigma"], + } + + _REVERSE_LOOKUP: ClassVar[dict[str, str]] = {value.lower(): key for key, values in SHORTHAND.items() for value in values} + + @classmethod + def get_shorthand(cls, longhand: str) -> str | None: + """Get the short-hand compartment name from the long-hand name. + + Args: + longhand: The long-hand compartment name (e.g., 'cytoplasm', 'extracellular'). + + Returns: + The short-hand compartment name if found, None otherwise. + """ + return cls._REVERSE_LOOKUP.get(longhand.lower(), None) + + @classmethod + def get_longhand(cls, shorthand: str) -> str | None: + """Get the long-hand compartment name from the short-hand name. + + Args: + shorthand: The short-hand compartment name (e.g., 'c', 'e', 'm'). + + Returns: + The long-hand compartment name if found, None otherwise. + """ + longhand = cls.SHORTHAND.get(shorthand.lower(), None) + return longhand[0] if longhand else None + + +class BuildResults(NamedTuple): + """Results of building a context specific model.""" + + model: cobra.Model + expression_index_list: list[int] + infeasible_reactions: pd.DataFrame + + +class BoundaryReactions(NamedTuple): + """Boundary reactions to be used in the context specific model.""" + + reactions: list[str] + lower_bounds: list[float] + upper_bounds: list[float] + + +@dataclass +class _BatchEntry: + batch_num: int + sample_names: list[str] + _num_samples: int = field(init=False) + + def __post_init__(self): + self._num_samples = len(self.sample_names) + + @property + def num_samples(self): + return self._num_samples + + +@dataclass +class CombineOmicsInput: + z_score_matrix: pd.DataFrame + type: SourceTypes + weight: int + + +class _BaseDataType: + """Base class for common data types.""" + + def __getitem__(self, value: str): + """Access matrices using square bracket notation (e.g., `input_matrices['total_rna']`). + + Args: + value: The name of the matrix to get ('trna', 'mrna', 'scrna', 'proteomics') + + Returns: + The DataFrame if it exists, None otherwise. + """ + self._validate_attribute(value) + return getattr(self, value) + + def __setitem__(self, key, value): + """Set matrices using square bracket notation (e.g., `input_matrices['total_rna'] = new_df`). 
+ + :param key: The key to set + :param value: The new value + """ + self._validate_attribute(key) + setattr(self, key, value) + + def _validate_attribute(self, key): + if key not in {i.value for i in SourceTypes._member_map_.values()}: + # Unable to use como.utils._log_and_raise_error because it results in a circular import + message = f"{key} is not a valid attribute of {SourceTypes.__name__}; got '{key}'" + logger.warning(message) + raise ValueError(message) + + def __iter__(self) -> Iterator[tuple[SourceTypes, pd.DataFrame | None]]: + """Iterate over matrix fields and their names. + + Yields: + A tuple containing (matrix_name, matrix_dataframe). + + """ + for field_ in fields(self): + yield SourceTypes(field_.name), getattr(self, field_.name) + + +@dataclass(slots=True) +class BatchNames(_BaseDataType): + trna: list[_BatchEntry] + mrna: list[_BatchEntry] + scrna: list[_BatchEntry] + proteomics: list[_BatchEntry] + + +@dataclass(slots=True) +class InputMatrices(_BaseDataType): + trna: pd.DataFrame | None = None + mrna: pd.DataFrame | None = None + scrna: pd.DataFrame | None = None + proteomics: pd.DataFrame | None = None + + +@dataclass(slots=True) +class OutputCombinedSourceFilepath(_BaseDataType): + trna: Path | None + mrna: Path | None + scrna: Path | None + proteomics: Path | None + + +@dataclass(slots=True) +class SourceWeights(_BaseDataType): + trna: int + mrna: int + scrna: int + proteomics: int diff --git a/main/como/disease_analysis.py b/main/como/disease_analysis.py new file mode 100644 index 00000000..8eb7b52e --- /dev/null +++ b/main/como/disease_analysis.py @@ -0,0 +1,180 @@ +# ruff: noqa + + +import argparse +import json +from pathlib import Path + +import pandas as pd +import rpy2.robjects as ro +import rpy2_api +from fast_bioservices import BioDBNet, Input, Output +from project import Config +from rpy2.robjects import pandas2ri +from rpy2.robjects.packages import importr + +configs = Config() + +pandas2ri.activate() + +# import R libraries +DESeq2 = importr("DESeq2") +edgeR = importr("edgeR") +readxl = importr("readxl") + +DGEio = rpy2_api.Rpy2(r_file_path=Path(configs.code_dir, "rscripts", "DGE.R")) + + +def get_rnaseq_diff_gene_exp(config_filepath, disease_name, context_name, taxon_id): + """ + Get differential gene expression for RNA-seq data + + param: config_filepath - path to rna-seq disease configuration xlsx file + param: disease_name - string, disease name which should correspond to sheet name in disease config xlsx file + param: context_name - string, context name which should correspond to folder in 'results' folder + + return: dataframe with fold changes, FDR adjusted p-values, + """ + count_matrix_filename = "".join(["gene_counts_matrix_", disease_name, "_", context_name, ".csv"]) + count_matrix_path: Path = configs.data_dir / "data_matrices" / context_name / "disease" / count_matrix_filename + + if count_matrix_path.exists(): + print("Count Matrix File is at ", count_matrix_path) + else: + raise FileNotFoundError(f"Count Matrix File not found at {count_matrix_path}") + + diff_exp_df = DGEio.call_function("DGE_main", count_matrix_path, config_filepath, context_name, disease_name) + diff_exp_df = ro.conversion.rpy2py(diff_exp_df) + gse_id = "rnaseq" + + biodbnet = BioDBNet() + bdnet = biodbnet.db2db( + input_values=list(map(str, diff_exp_df["Ensembl"].tolist())), + input_db=Input.ENSEMBL_GENE_ID, + output_db=[Output.GENE_ID, Output.AFFY_ID, Output.GENE_SYMBOL], + taxon=taxon_id, + ) + + diff_exp_df["Affy"] = bdnet["Affy ID"].tolist() + diff_exp_df["Entrez"] = 
bdnet["Gene ID"].tolist() + diff_exp_df["Symbol"] = bdnet["Gene Symbol"].tolist() + + return diff_exp_df, gse_id + + +def write_outputs(diff_exp_df, gse_id, context_name, disease_name, target_path): + search_col = "Ensembl" + diff_exp_df["logFC"].astype(float) + diff_exp_df["abs_logFC"] = diff_exp_df["logFC"].abs() + diff_exp_df["FDR"].astype(float) + diff_exp_df.sort_values(by="abs_logFC", ascending=False, inplace=True) + regulated = diff_exp_df[diff_exp_df["FDR"] < 0.05] + down_regulated = regulated[regulated["logFC"] < 0] + up_regulated = regulated[regulated["logFC"] > 0] + diff_exp_df["regulated"] = [ + "unchanged" + if gene not in regulated[search_col].tolist() + else ("upregulated" if gene in up_regulated[search_col].tolist() else "downregulated") + for gene in diff_exp_df[search_col].tolist() + ] + up_file = configs.result_dir / context_name / disease_name / f"Disease_UP_{gse_id}.txt" + down_file = configs.result_dir / context_name / disease_name / f"Disease_DOWN_{gse_id}.txt" + + up_file.parent.mkdir(parents=True, exist_ok=True) + down_file.parent.mkdir(parents=True, exist_ok=True) + + up_regulated = up_regulated[up_regulated["Entrez"] != "-"] + down_regulated = down_regulated[down_regulated["Entrez"] != "-"] + + up_regulated["Entrez"].to_csv(up_file, index=False) + down_regulated["Entrez"].to_csv(down_file, index=False) + print(f"Upregulated genes saved to '{up_file}'") + print(f"Downregulated genes saved to '{down_file}'") + + raw_file = configs.result_dir / context_name / disease_name / f"Raw_Fit_{gse_id}.csv" + diff_exp_df.drop(columns=["Affy"], inplace=True) # drop for now bc commas mess up csv parsing, maybe fix later + diff_exp_df.to_csv(raw_file, index=False) + print(f"Raw Data saved to '{raw_file}'") + + files_dict = { + "gse": gse_id, + "up_regulated": up_file, + "down_regulated": down_file, + "raw_data": raw_file, + } + + files_json = configs.result_dir / context_name / disease_name / "step2_results_files.json" + files_json.parent.mkdir(parents=True, exist_ok=True) + with open(files_json, "w") as fp: + json.dump(files_dict, fp) + + +def main(): + target_file = "targets.txt" + + parser = argparse.ArgumentParser( + prog="disease_analysis.py", + description="Performs differential gene expression analysis to find up and downregulated genes associated " + "with a disease. 
Significant genes are ones that have an FDR adjusted P-value < 0.05 and an " + "absolute fold-change greater than the threshold specified, default is 2", + epilog="For additional help, please post questions/issues in the MADRID GitHub repo at: " + "https://github.com/HelikarLab/MADRID or email babessell@gmail.com", + ) + parser.add_argument( + "-c", + "--config-file", + type=str, + required=True, + dest="config_file", + help="The path to the configuration file", + ) + parser.add_argument( + "-t", + "--context-name", + type=str, + required=True, + dest="context_name", + help="The type of context being used", + ) + parser.add_argument( + "-i", + "--taxon-id", + required=False, + default=9606, + dest="taxon_id", + help="BioDbNet taxon ID number, also accepts 'human', or 'mouse'", + ) + + args = parser.parse_args() + context_name = args.context_name + config_file = args.config_file + taxon_id = args.taxon_id + config_filepath = configs.config_dir / "disease" / config_file + + if not config_filepath.exists(): + raise FileNotFoundError(f"Config file not found at {config_filepath}") + if not config_filepath.suffix == ".xlsx": + raise ValueError("Config file must be in xlsx format!") + print("Config file is at ", config_filepath) + xl = pd.ExcelFile(config_filepath) + + # handle species alternative ids + if isinstance(taxon_id, str): + if taxon_id.upper() == "HUMAN" or taxon_id.upper() == "HOMO SAPIENS": + taxon_id = 9606 + elif taxon_id.upper() == "MOUSE" or taxon_id.upper() == "MUS MUSCULUS": + taxon_id = 10090 + else: + raise ValueError("taxon_id must be either an integer, or accepted string ('mouse', 'human')") + elif not isinstance(taxon_id, int): + raise ValueError("taxon_id must be either an integer, or accepted string ('mouse', 'human')") + + sheet_names = xl.sheet_names + for disease_name in sheet_names: + target_path = configs.data_dir / target_file + diff_exp_df, gse_id = get_rnaseq_diff_gene_exp(config_filepath, disease_name, context_name, taxon_id) + write_outputs(diff_exp_df, gse_id, context_name, disease_name, target_path) + + +if __name__ == "__main__": + main() diff --git a/main/como/knock_out_simulation.py b/main/como/knock_out_simulation.py new file mode 100644 index 00000000..9e4e578a --- /dev/null +++ b/main/como/knock_out_simulation.py @@ -0,0 +1,477 @@ +# ruff: noqa + +import argparse +import os +import re +import sys +from concurrent.futures import Future, ProcessPoolExecutor, ThreadPoolExecutor, as_completed +from dataclasses import dataclass +from pathlib import Path +from typing import Union + +import cobra +import numpy as np +import pandas as pd +from fast_bioservices import BioDBNet, Input, Output +from project import Config + +configs = Config() + + +@dataclass +class KnockoutResults: + model: cobra.Model + gene_ind2genes: set[str] + genes_with_metabolic_effects: list[str] + flux_solution: pd.DataFrame + flux_solution_ratios: pd.DataFrame + flux_solution_diffs: pd.DataFrame + + +def _perform_knockout( + model: cobra.Model, + gene_id: str, + reference_solution, +) -> tuple[str, pd.Series]: + """This function will perform a single gene knockout. 
It will be used in multiprocessing""" + with model: + gene: cobra.Gene = model.genes.get_by_id(gene_id) + gene.knock_out() + optimized_model: cobra.Solution = cobra.flux_analysis.moma(model, solution=reference_solution, linear=False) + return gene_id, optimized_model.fluxes + + +def knock_out_simulation( + model: cobra.Model, + inhibitors_filepath: Path, + drug_db: pd.DataFrame, + reference_flux_filepath: Union[str, Path, None], + test_all: bool, + pars_flag: bool, +) -> KnockoutResults: + reference_solution: cobra.Solution + if reference_flux_filepath is not None: + reference_flux_filepath: Path = Path(reference_flux_filepath) + if not reference_flux_filepath.exists(): + raise FileNotFoundError(f"Reference flux file not found at {reference_flux_filepath.as_posix()}") + reference_flux_df: pd.DataFrame = pd.read_csv(reference_flux_filepath) + if "rxn" not in reference_flux_df.columns or "flux" not in reference_flux_df.columns: + raise KeyError("Reference flux file must be a CSV file with the columns 'rxn' and 'flux' with the same number of rows as the number of reactions in the given context-specific model!") # fmt: skip + reference_flux_df.set_index("rxn", inplace=True) + reference_flux = reference_flux_df["flux"].squeeze() + reference_solution = cobra.core.solution.Solution(model.objective, "OPTIMAL", reference_flux) # fmt: skip + else: + reference_solution = cobra.flux_analysis.pfba(model) if pars_flag else model.optimize() + + drug_target_genes: pd.DataFrame + if inhibitors_filepath.exists(): + print(f"Inhibitors file found at: {inhibitors_filepath}") + drug_target_genes = pd.read_csv(inhibitors_filepath, sep="\t") + # dt_genes.rename(columns={0: "Gene ID"}, inplace=True) + drug_target_genes["Gene ID"] = drug_target_genes["Gene ID"].astype(str) + else: + # only keep inhibitors + drug_db = drug_db[drug_db["moa"].str.lower().str.contains("inhibitor")] + drug_target_genes = pd.DataFrame(columns=["Gene ID"]) + drug_target_genes["Gene ID"] = drug_db["Gene ID"].astype(str) + drug_target_genes.replace("-", pd.NA, inplace=True) + drug_target_genes.dropna(axis=0, inplace=True) + drug_target_genes.to_csv(inhibitors_filepath, header=True, sep="\t", index=False) + print(f"Inhibitors file written to: {inhibitors_filepath}") + + gene_ind2genes = set(x.id for x in model.genes) + dt_model = list(set(drug_target_genes["Gene ID"].tolist()).intersection(gene_ind2genes)) + print(f"{len(gene_ind2genes)} genes in model, {len(dt_model)} can be targeted by inhibitors") + + wild_type_model = cobra.flux_analysis.moma(model, solution=reference_solution).to_frame() + wild_type_model[abs(wild_type_model) < 1e-6] = 0.0 + + genes_with_metabolic_effects = [] + for id_ in dt_model: + gene: cobra.Gene = model.genes.get_by_id(id_) + for rxn in gene.reactions: + gene_reaction_rule = rxn.gene_reaction_rule + gene_ids = re.findall(r"\d+", gene_reaction_rule) + for gene_id in gene_ids: + boolval = "False" if gene_id == id_ else str(model.genes.get_by_id(gene_id).functional) + gene_reaction_rule = gene_reaction_rule.replace(gene_id, boolval, 1) + if not eval(gene_reaction_rule) or test_all: + genes_with_metabolic_effects.append(id_) + break + print(f"Found {len(genes_with_metabolic_effects)} genes with potentially-significant metabolic impacts") # fmt: skip + + futures: list[Future[tuple[str, pd.Series]]] = [] + with ProcessPoolExecutor(max_workers=os.cpu_count()) as executor: + for i, id_ in enumerate(genes_with_metabolic_effects, start=1): + future: Future = executor.submit(_perform_knockout, model, id_, reference_solution) 
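+            # Each submitted future resolves to a (gene_id, knock-out flux Series) pair; results are collected below as they complete.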
+ futures.append(future) + + gene_id: str + knock_out_flux: pd.Series + flux_solution: pd.DataFrame = pd.DataFrame() + for result in as_completed(futures): + gene_id, knock_out_flux = result.result() + flux_solution[gene_id] = knock_out_flux + + flux_solution[abs(flux_solution) < 1e-6] = 0.0 + flux_solution_ratios = flux_solution.div(wild_type_model["fluxes"], axis=0) + flux_solution_diffs = flux_solution.sub(wild_type_model["fluxes"], axis=0) + + return KnockoutResults( + model=model, + gene_ind2genes=gene_ind2genes, + genes_with_metabolic_effects=genes_with_metabolic_effects, + flux_solution=flux_solution, + flux_solution_ratios=flux_solution_ratios, + flux_solution_diffs=flux_solution_diffs, + ) + + +def create_gene_pairs( + datadir: Union[str, Path], + model: cobra.Model, + gene_ind2genes: set[str], + flux_solution: pd.DataFrame, + flux_solution_ratios: pd.DataFrame, + flux_solution_diffs: pd.DataFrame, + has_effects_gene: list[str], + disease_genes_filename: Path, +): + disease_genes_df: pd.DataFrame = pd.read_csv(str(os.path.join(datadir, disease_genes_filename))) + if len(disease_genes_df.columns) != 1: + raise ValueError(f"Expected 1 column in {disease_genes_filename}, got {len(disease_genes_df.columns)}") + + disease_genes_df.columns = ["Gene ID"] + disease_genes_df["Gene ID"] = disease_genes_df["Gene ID"].astype(str) + metabolic_disease_genes = set(disease_genes_df["Gene ID"]).intersection(gene_ind2genes) + + gene_df = pd.DataFrame(columns=["Gene ID", "Reaction ID"]) + for id_ in metabolic_disease_genes: + model_gene: cobra.Gene = model.genes.get_by_id(id_) + gene_reactions: list[str] = [rxn.id for rxn in model_gene.reactions] + gene_df = pd.concat([gene_df, pd.DataFrame({"Gene ID": id_, "Reaction ID": gene_reactions})], ignore_index=True) + gene_df.set_index("Reaction ID", drop=True, inplace=True) + + dag_rxn_flux_ratio: pd.DataFrame = flux_solution_ratios.loc[gene_df.index.tolist()] + dag_rxn_flux_diffs: pd.DataFrame = flux_solution_diffs.loc[gene_df.index.tolist()] + dag_rxn_flux_value: pd.DataFrame = flux_solution.loc[gene_df.index.tolist()] + gene_mat_out: list[pd.DataFrame] = [] + + for id_ in has_effects_gene: + pegene = pd.DataFrame() + pegene["Gene ID"] = gene_df["Gene ID"] + pegene["rxn_flux_ratio"] = dag_rxn_flux_ratio[id_] + pegene["Gene"] = id_ + + rxn_flux_diffs = dag_rxn_flux_diffs[id_] + rxn_flux_value = dag_rxn_flux_value[id_] + pegene = pegene.loc[(~pegene["rxn_flux_ratio"].isna()) & (abs(rxn_flux_diffs) + abs(rxn_flux_value) > 1e-8)] + pegene.index.name = "reaction" + gene_mat_out.append(pegene) + + gene_pairs = pd.concat(gene_mat_out, ignore_index=True) + return gene_pairs + + +def score_gene_pairs(gene_pairs, filename, input_reg): + p_model_genes = gene_pairs.Gene.unique() + d_score = pd.DataFrame([], columns=["score"]) + for p_gene in p_model_genes: + data_p = gene_pairs.loc[gene_pairs["Gene"] == p_gene].copy() + total_aff = data_p["Gene ID"].unique().size + n_aff_down = data_p.loc[abs(data_p["rxn_flux_ratio"]) < 0.9, "Gene ID"].unique().size + n_aff_up = data_p.loc[abs(data_p["rxn_flux_ratio"]) > 1.1, "Gene ID"].unique().size + if input_reg == "up": + d_s = (n_aff_down - n_aff_up) / total_aff + else: + d_s = (n_aff_up - n_aff_down) / total_aff + + d_score.at[p_gene, "score"] = d_s + + d_score.index.name = "Gene ID" + d_score.to_csv(configs.data_dir / filename) + return d_score + + +def score_gene_pairs_diff(gene_pairs, file_full_path): + p_model_genes = gene_pairs.Gene.unique() + d_score = pd.DataFrame([], columns=["score"]) + for p_gene in p_model_genes: + 
data_p = gene_pairs.loc[gene_pairs["Gene"] == p_gene].copy() + total_aff = data_p["Gene ID"].unique().size + n_aff_down = data_p.loc[data_p["rxn_flux_ratio"] < -1e-8, "Gene ID"].unique().size + n_aff_up = data_p.loc[data_p["rxn_flux_ratio"] > 1e-8, "Gene ID"].unique().size + d_s = (n_aff_down - n_aff_up) / total_aff + d_score.at[p_gene, "score"] = d_s + + d_score.index.name = "Gene" + d_score.to_csv(file_full_path) + return d_score + + +def repurposing_hub_preproc(drug_info_filepath: Path, biodbnet: BioDBNet): + drug_info_df: pd.DataFrame = pd.read_csv(drug_info_filepath, sep="\t") + drug_info_df["target"] = drug_info_df["target"].str.split("|").explode().reset_index(drop=True) + drug_info_df = ( + drug_info_df.drop(columns=["disease_area", "indication"]) + .rename(columns={"pert_iname": "name", "clinical_phase": "phase"}) + .dropna(subset=["target", "moa"]) + ) + # for index, row in drug_db.iterrows(): + # if pd.isnull(row["target"]): + # continue + # for target in row["target"].split("|"): + # drug_db_new = pd.concat( + # [ + # drug_db_new, + # pd.DataFrame( + # [ + # { + # "Name": row["pert_iname"], + # "MOA": row["moa"], + # "Target": target.strip(), + # "Phase": row["clinical_phase"], + # } + # ] + # ), + # ], + # ignore_index=True, + # ) + # drug_db_new.reset_index(inplace=True) + entrez_ids = biodbnet.db2db( + input_values=drug_info_df["target"].tolist(), + input_db=Input.GENE_SYMBOL, + output_db=Output.GENE_ID, + ) + entrez_ids.rename(columns={"Gene Symbol": "target"}, inplace=True) + drug_info_df = pd.merge(drug_info_df, entrez_ids, on="target") + # entrez_ids.reset_index(drop=False, inplace=True) + # drug_db_new["ENTREZ_GENE_ID"] = entrez_ids["Gene ID"] + # drug_db_new = drug_db_new[["Name", "MOA", "Target", "ENTREZ_GENE_ID", "Phase"]] + return drug_info_df + + +def drug_repurposing(drug_db: pd.DataFrame, perturbation_score: pd.DataFrame, biodbnet: BioDBNet): + perturbation_score["Gene ID"] = perturbation_score["Gene ID"].astype(str) + + conversion = biodbnet.db2db( + input_values=perturbation_score["Gene ID"].tolist(), + input_db=Input.GENE_ID, + output_db=[Output.GENE_SYMBOL], + ) + + perturbation_score = pd.merge(perturbation_score, conversion, on="Gene ID", how="left") + # d_score.set_index("Gene", inplace=True) + # d_score["Gene Symbol"] = d_score_gene_sym["Gene Symbol"] + # d_score.reset_index(drop=False, inplace=True) + drug_scores = pd.DataFrame() + for index, row in perturbation_score.iterrows(): + target = row["Gene Symbol"] + drugs = drug_db.loc[drug_db["target"] == target, :].copy() # Use `.copy()` to prevent `SettingWithCopyWarning` + drugs["score"] = row["score"] + drug_scores = pd.concat([drug_scores, drugs], ignore_index=True) + + drug_scores.drop_duplicates(inplace=True) + drug_scores = drug_scores[drug_scores["moa"].str.lower().str.contains("inhibitor")] + return drug_scores + + +def main(argv): + parser = argparse.ArgumentParser( + prog="knock_out_simulation.py", + description="This script is responsible for mapping drug targets in metabolic models, performing knock out simulations, and comparing simulation results with disease genes. 
It also identifies drug targets and repurposable drugs.",
+        epilog="For additional help, please post questions/issues in the MADRID GitHub repo at https://github.com/HelikarLab/COMO",
+    )
+    parser.add_argument(
+        "-m",
+        "--context-model",
+        type=str,
+        required=True,
+        dest="model",
+        help="The context-specific model file (must be .mat, .xml, or .json)",
+    )
+    parser.add_argument(
+        "-c",
+        "--context-name",
+        type=str,
+        required=True,
+        dest="context",
+        help="Name of the context, tissue, cell type, etc.",
+    )
+    parser.add_argument(
+        "-d",
+        "--disease-name",
+        type=str,
+        required=True,
+        dest="disease",
+        help="Name of disease",
+    )
+    parser.add_argument(
+        "-up",
+        "--disease-up",
+        type=str,
+        required=True,
+        dest="disease_up",
+        help="The name of the disease up-regulated file",
+    )
+    parser.add_argument(
+        "-dn",
+        "--disease-down",
+        type=str,
+        required=True,
+        dest="disease_down",
+        help="The name of the disease down-regulated file",
+    )
+    parser.add_argument(
+        "-r",
+        "--raw-drug-file",
+        type=str,
+        required=True,
+        dest="raw_drug_file",
+        help="The name of the raw drug file",
+    )
+    parser.add_argument(
+        "-f",
+        "--reference-flux-file",
+        type=str if ("--reference-flux-file" in argv or "-f" in argv) else type(None),
+        required=False,
+        default=None,
+        dest="ref_flux_file",
+        help="The name of the reference flux file",
+    )
+    parser.add_argument(
+        "-a",
+        "--test-all",
+        action="store_true",
+        required=False,
+        default=False,
+        dest="test_all",
+        help="Test all genes, even ones predicted to have little to no effect.",
+    )
+    parser.add_argument(
+        "-p",
+        "--parsimonious",
+        action="store_true",
+        required=False,
+        default=False,
+        dest="pars_flag",
+        help="Use parsimonious FBA for optimal reference solution (only if not providing flux file)",
+    )
+    parser.add_argument(
+        "-s",
+        "--solver",
+        type=str,
+        required=False,
+        default="gurobi",
+        dest="solver",
+        help="The solver to use for FBA. 
Options are: gurobi or glpk", + ) + + args = parser.parse_args() + tissue_spec_model_file = Path(args.model) + context = args.context + disease = args.disease + disease_up_file = Path(args.disease_up) + disease_down_file = Path(args.disease_down) + raw_drug_filename = args.raw_drug_file + ref_flux_file = args.ref_flux_file + test_all = args.test_all + pars_flag = args.pars_flag + solver = args.solver + + output_dir = Path(configs.data_dir, "results", context, disease) + inhibitors_filepath = Path(output_dir, f"{context}_{disease}_inhibitors.tsv") + biodbnet = BioDBNet(cache=False) + thread_pool = ThreadPoolExecutor(max_workers=1) + + print(f"Output directory: '{output_dir.as_posix()}'") + print(f"Tissue Specific Model file is at: {tissue_spec_model_file.as_posix()}") + print(f"Tissue specific inhibitors is at: {inhibitors_filepath.as_posix()}") + + if not tissue_spec_model_file.exists(): + raise FileNotFoundError(f"Model file not found at {tissue_spec_model_file.as_posix()}") + elif tissue_spec_model_file.suffix == ".mat": + future = thread_pool.submit(cobra.io.load_matlab_model, infile_path=tissue_spec_model_file.as_posix()) # type: ignore + elif tissue_spec_model_file.suffix in (".xml", ".sbml"): + future = thread_pool.submit(cobra.io.read_sbml_model, filename=tissue_spec_model_file.as_posix()) # type: ignore + elif tissue_spec_model_file.suffix == ".json": + future = thread_pool.submit(cobra.io.load_json_model, filename=tissue_spec_model_file.as_posix()) # type: ignore + else: + raise NameError("Reference model must be in 'mat', 'xml', 'sbml', or 'json' format.") + + raw_drug_filepath = Path(configs.data_dir, raw_drug_filename) + reformatted_drug_filepath = raw_drug_filepath.with_stem(f"{raw_drug_filepath.stem}_processed") + drug_info_df: pd.DataFrame + if reformatted_drug_filepath.exists(): + print(f"Found preprocessed Repurposing Hub tsv file at: {reformatted_drug_filepath}") + drug_info_df = pd.read_csv(reformatted_drug_filepath, sep="\t") + else: + print("Preprocessing raw Repurposing Hub DB file...") + drug_info_df = repurposing_hub_preproc(drug_info_filepath=raw_drug_filepath, biodbnet=biodbnet) + drug_info_df.to_csv(reformatted_drug_filepath, index=False, sep="\t") + print(f"Preprocessed Repurposing Hub tsv file written to: {reformatted_drug_filepath.as_posix()}") + + cobra_model: cobra.Model = future.result() + cobra_model.solver = solver + thread_pool.shutdown() + + knockout_results = knock_out_simulation( + model=cobra_model, + inhibitors_filepath=inhibitors_filepath, + drug_db=drug_info_df, + reference_flux_filepath=ref_flux_file, + test_all=test_all, + pars_flag=pars_flag, + ) + + knockout_results.flux_solution_diffs.to_csv(output_dir / "flux_diffs_KO.csv") + knockout_results.flux_solution_ratios.to_csv(output_dir / "flux_ratios_KO.csv") + + gene_pairs_down = create_gene_pairs( + configs.data_dir, + knockout_results.model, + knockout_results.gene_ind2genes, + knockout_results.flux_solution, + knockout_results.flux_solution_ratios, + knockout_results.flux_solution_diffs, + knockout_results.genes_with_metabolic_effects, + disease_genes_filename=disease_down_file, + ) + gene_pairs_down.to_csv(os.path.join(output_dir, f"{context}_Gene_Pairs_Inhi_Fratio_DOWN.txt"), index=False) + + gene_pairs_up = create_gene_pairs( + configs.data_dir, + knockout_results.model, + knockout_results.gene_ind2genes, + knockout_results.flux_solution, + knockout_results.flux_solution_ratios, + knockout_results.flux_solution_diffs, + knockout_results.genes_with_metabolic_effects, + 
disease_genes_filename=disease_up_file, + ) + gene_pairs_up.to_csv(os.path.join(output_dir, f"{context}_Gene_Pairs_Inhi_Fratio_UP.txt"), index=False) + + d_score_down = score_gene_pairs( + gene_pairs_down, + os.path.join(output_dir, f"{context}_d_score_DOWN.csv"), + input_reg="down", + ) + d_score_up = score_gene_pairs( + gene_pairs_up, + os.path.join(output_dir, f"{context}_d_score_UP.csv"), + input_reg="up", + ) + perturbation_score: pd.DataFrame = (d_score_up + d_score_down).sort_values(by="score", ascending=False) + perturbation_score.to_csv(os.path.join(output_dir, f"{context}_d_score.csv")) + perturbation_score.reset_index(drop=False, inplace=True) + + drug_score = drug_repurposing(drug_db=drug_info_df, perturbation_score=perturbation_score, biodbnet=biodbnet) + drug_score_file = os.path.join(output_dir, f"{context}_drug_score.csv") + drug_score.to_csv(drug_score_file, index=False) + print(f"Gene D score mapped to repurposing drugs saved to {drug_score_file}") + + print(f"\nFinished {disease}!") + + +if __name__ == "__main__": + main(sys.argv[1:]) diff --git a/main/como/merge_xomics.py b/main/como/merge_xomics.py new file mode 100644 index 00000000..71ddd776 --- /dev/null +++ b/main/como/merge_xomics.py @@ -0,0 +1,731 @@ +from __future__ import annotations + +import asyncio +import sys +from io import TextIOWrapper +from pathlib import Path + +import numpy as np +import pandas as pd +from fast_bioservices.biothings.mygene import MyGene +from loguru import logger + +from como.combine_distributions import ( + _begin_combining_distributions, +) +from como.data_types import ( + AdjustmentMethod, + BatchNames, + InputMatrices, + LogLevel, + OutputCombinedSourceFilepath, + RNAType, + SourceTypes, + SourceWeights, + _BatchEntry, +) +from como.project import Config +from como.utils import _log_and_raise_error, get_missing_gene_data, read_file, return_placeholder_data, set_up_logging + + +class _MergedHeaderNames: + TRNASEQ = "trnaseq" + MRNASEQ = "mrnaseq" + SCRNASEQ = "scrnaseq" + PROTEOMICS = "prote" + + +class _ExpressedHeaderNames: + TRNASEQ = f"{_MergedHeaderNames.TRNASEQ}_exp" + MRNASEQ = f"{_MergedHeaderNames.MRNASEQ}_exp" + SCRNASEQ = f"{_MergedHeaderNames.SCRNASEQ}_exp" + PROTEOMICS = f"{_MergedHeaderNames.PROTEOMICS}_exp" + + +class _HighExpressionHeaderNames: + TRNASEQ = f"{_MergedHeaderNames.TRNASEQ}_high" + MRNASEQ = f"{_MergedHeaderNames.MRNASEQ}_high" + SCRNASEQ = f"{_MergedHeaderNames.SCRNASEQ}_high" + PROTEOMICS = f"{_MergedHeaderNames.PROTEOMICS}_high" + + +# TODO: If function is no longer needed, remove? +def _load_rnaseq_tests(filename, context_name, prep_method: RNAType) -> tuple[str, pd.DataFrame]: + """Load rnaseq results. 
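+
+    For example, the expected file is "<result_dir>/<context_name>/<prep_method>/<prep_method>_<context_name>.csv";
+    if that file does not exist, placeholder data is returned instead (see the body below).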
+ + Args: + filename: Name of the file to load + context_name: Name of the context (e.g., tissue or cell type) + prep_method: The RNA-seq library preparation method (e.g., mRNA, total RNA, single-cell RNA) + + Returns: + A tuple containing the context name and the loaded DataFrame± + """ + logger.debug(f"Loading data for context '{context_name}' using preparation method '{prep_method.value}'") + config = Config() + + def load_dummy_dict(): + df = return_placeholder_data() + return "dummy", df + + if not filename or filename == "None": # not using this data type, use empty dummy data matrix + return load_dummy_dict() + + inquiry_full_path = config.data_dir / "config_sheets" / filename + if not inquiry_full_path.exists(): + _log_and_raise_error( + f"Config file not found at {inquiry_full_path}", + error=FileNotFoundError, + level=LogLevel.ERROR, + ) + + match prep_method: + case RNAType.TRNA: + filename = f"{RNAType.TRNA.value}_{context_name}.csv" + case RNAType.MRNA: + filename = f"{RNAType.MRNA.value}_{context_name}.csv" + case RNAType.SCRNA: + filename = f"{RNAType.SCRNA.value}_{context_name}.csv" + case _: + _log_and_raise_error( + f"Unsupported RNA-seq library type: {prep_method.value}. Must be one of {', '.join(RNAType)}.", + error=ValueError, + level=LogLevel.ERROR, + ) + + save_filepath = config.result_dir / context_name / prep_method.value / filename + if save_filepath.exists(): + logger.debug(f"Loading RNA-seq data from: {save_filepath}") + data = pd.read_csv(save_filepath, index_col="entrez_gene_id") + logger.success(f"Successfully loaded RNA-seq data from: {save_filepath}") + return context_name, data + + else: + logger.warning( + f"'{prep_method.value}' gene expression file for '{context_name}' was not found at '{save_filepath}'. " + f"If this is not intentional, please fix the filename to match '{save_filepath}'." + ) + return load_dummy_dict() + + +# Merge Output +def _merge_logical_table(df: pd.DataFrame): + """Merge rows of Logical Table belonging to the same entrez_gene_id. 
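+
+    For example, a row whose entrez_gene_id is "123 /// 456" is expanded into separate rows for
+    "123" and "456", and overlapping multi-ID groups are merged into a single " /// "-joined identifier.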
+ + Args: + df: Pandas dataframe containing the logical table + + Returns: + pandas dataframe of merged table + """ + # step 1: get all plural ENTREZ_GENE_IDs in the input table, extract unique IDs + df.dropna(subset=["entrez_gene_id"], inplace=True) + df["entrez_gene_id"] = df["entrez_gene_id"].astype(str).str.replace(" /// ", "//").astype(str) + + id_list: list[str] = df.loc[~df["entrez_gene_id"].str.contains("//"), "entrez_gene_id"].tolist() # Collect "single" ids, like "123" + multiple_entrez_ids: list[str] = df.loc[ + df["entrez_gene_id"].str.contains("//"), "entrez_gene_id" + ].tolist() # Collect "double" ids, like "123//456" + + for i in multiple_entrez_ids: + ids = i.split("//") + id_list.extend(ids) + logger.trace(f"Processing multiple IDs {ids} for {i}") + + duplicate_rows = pd.DataFrame([]) + for j in ids: + rows = df.loc[df["entrez_gene_id"] == i].copy() + rows["entrez_gene_id"] = j + duplicate_rows = pd.concat([duplicate_rows, rows], axis=0) + + df = pd.concat([df, pd.DataFrame(duplicate_rows)], axis=0, ignore_index=True) + df.drop(df[df["entrez_gene_id"] == i].index, inplace=True) + logger.trace(f"Shape after merging duplicated rows: {df.shape}") + + full_entrez_id_sets: set[str] = set() + entrez_dups_list: list[list[str]] = [] + multi_entrez_index = list(range(len(multiple_entrez_ids))) + + logger.trace("Starting to merge multiple entrez IDs") + temp_multi_entrez_index = multi_entrez_index.copy() + for i in range(len(multiple_entrez_ids)): + if i not in multi_entrez_index: + continue + + logger.trace(f"Iterating through multi-entrez ids, index {i}") + + set1 = set(multiple_entrez_ids[i].split("//")) + temp_multi_entrez_index.remove(i) + + for j in multi_entrez_index: + set2 = set(multiple_entrez_ids[j].split("//")) + intersect = set1.intersection(set2) + if bool(intersect): + set1 = set1.union(set2) + temp_multi_entrez_index.remove(j) + + sortlist = list(set1) + sortlist.sort(key=int) + new_entrez_id = " /// ".join(sortlist) + full_entrez_id_sets.add(new_entrez_id) + + multi_entrez_index = temp_multi_entrez_index.copy() + + logger.debug(f"Finished merging multiple entrez IDs, found {len(full_entrez_id_sets)} sets") + entrez_dups_list.extend(i.split(" /// ") for i in full_entrez_id_sets) + entrez_dups_dict = dict(zip(full_entrez_id_sets, entrez_dups_list, strict=True)) + + logger.trace("Replacing IDs in dataframe") + for merged_entrez_id, entrez_dups_list in entrez_dups_dict.items(): + df["entrez_gene_id"].replace(to_replace=entrez_dups_list, value=merged_entrez_id, inplace=True) + + df = df.fillna(-1).groupby(level=0).max() + df.replace(-1, np.nan, inplace=True) + logger.trace(f"Shape after merging: {df.shape}") + + # TODO: Test if this is working properly + """ + There seems to be an error when running Step 2.1 in the pipeline.ipynb file + The commented-out return statement tries to return the df_output dataframe values as integers, but NaN values exist + Because of this, it is unable to do so. + If we change this to simply output the database, the line "np.where(posratio >= top_proportion . . ." (line ~162) + Fails because it is comparing floats and strings + + I am unsure what to do in this situation + """ + # return df_output.astype(int) + return df + + +async def _get_transcriptmoic_details(merged_df: pd.DataFrame, taxon_id: int) -> pd.DataFrame: + """Get details of transcriptomic data. 
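+
+    If proteomic columns are present, they are dropped and the "required" and "total_expressed"
+    counts are reduced by one (where applicable) so that gene activity reflects only the
+    transcriptomic evidence.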
+ + This function will get the following details of transcriptomic data: + - Gene Symbol + - Gene Name + - entrez_gene_id + + The resulting dataframe will have its columns created in the order listed above + It will return a pandas dataframe with this information + + Args: + merged_df: A dataframe containing all active transcriptomic and proteomic genes + taxon_id: The NCBI taxonomy ID of the organism + + Returns: + A dataframe with the above-listed columns + """ + # If _ExpressedHeaderNames.PROTEOMICS.value is in the dataframe, lower the required expression by 1 + # We are only trying to get details for transcriptomic data + logger.debug("Obtaining transcriptomic details") + transcriptomic_df: pd.DataFrame = merged_df.copy() + if _ExpressedHeaderNames.PROTEOMICS in merged_df.columns: + logger.trace("Proteomic data found, modifying required and total expression values") + # Get the number of sources required for a gene to be marked "expressed" + required_expression = merged_df["required"].iloc[0] + + # Subtract 1 from merged_df["TotalExpressed"] if the current value is greater than or equal to 1 + # This is done to take into account the removal of proteomic expression + merged_df["total_expressed"] = merged_df["total_expressed"].apply(lambda x: x - 1 if x >= 1 else x) + + # Subtract required_expression by 1 if it is greater than 1 + if required_expression > 1: + required_expression -= 1 + + transcriptomic_df: pd.DataFrame = merged_df.drop( + columns=[ + _ExpressedHeaderNames.PROTEOMICS, + _HighExpressionHeaderNames.PROTEOMICS, + ], + inplace=False, + ) + logger.trace(f"Modified transcriptomic dataframe: {transcriptomic_df.shape}") + + # Must recalculate TotalExpressed because proteomic data was removed + # If the TotalExpressed column is less than the Required column, set active to 1, otherwise set it to 0 + transcriptomic_df.loc[ + transcriptomic_df["total_expressed"] >= transcriptomic_df["required"], + "active", + ] = 1 + + my_gene = MyGene() + gene_details: pd.DataFrame = pd.DataFrame( + data=pd.NA, + columns=["entrez_gene_id", "gene_symbol", "description", "gene_type"], + index=list(range(len(transcriptomic_df))), + ) + logger.trace(f"Querying MyGene for details on {len(transcriptomic_df)} genes") + for i, detail in enumerate( + await my_gene.query( + items=transcriptomic_df["entrez_gene_id"].tolist(), + taxon=taxon_id, + scopes="entrezgene", + ) + ): + gene_details.at[i, "entrez_gene_id"] = detail["entrezgene"] + gene_details.at[i, "gene_symbol"] = detail["symbol"] + gene_details.at[i, "description"] = detail["name"] + gene_details.at[i, "gene_type"] = detail["type_of_gene"] + + logger.debug("Finished obtaining transcriptomic details") + return gene_details + + +async def _merge_xomics( + context_name: str, + expression_requirement: int, + trna_boolean_matrix: pd.DataFrame | None, + mrna_boolean_matrix: pd.DataFrame | None, + scrna_boolean_matrix: pd.DataFrame | None, + proteomic_boolean_matrix: pd.DataFrame | None, + output_merged_filepath: Path, + output_gene_activity_filepath: Path, + output_transcriptomic_details_filepath: Path, + taxon_id: int, + force_activate_high_confidence: bool = True, + adjust_for_missing_sources: bool = False, +): + logger.debug(f"Starting to merge data sources for context '{context_name}'") + expression_list: list[str] = [] + high_confidence_list: list[str] = [] + merge_data: pd.DataFrame = pd.DataFrame() + + for matrix, expressed_sourcetype, high_expressed_sourcetype in ( + (trna_boolean_matrix, _ExpressedHeaderNames.TRNASEQ, 
_HighExpressionHeaderNames.TRNASEQ), + (mrna_boolean_matrix, _ExpressedHeaderNames.MRNASEQ, _HighExpressionHeaderNames.MRNASEQ), + (scrna_boolean_matrix, _ExpressedHeaderNames.SCRNASEQ, _HighExpressionHeaderNames.SCRNASEQ), + (proteomic_boolean_matrix, _ExpressedHeaderNames.PROTEOMICS, _HighExpressionHeaderNames.PROTEOMICS), + ): + if matrix is None: + logger.trace(f"Skipping {expressed_sourcetype} because it's matrix does not exist") + continue + + matrix: pd.DataFrame # re-define type to assist in type hinting for IDEs + expression_list.append(expressed_sourcetype) + high_confidence_list.append(high_expressed_sourcetype) + matrix.rename(columns={"expressed": expressed_sourcetype, "high": high_expressed_sourcetype}, inplace=True) + merge_data = matrix if merge_data.empty else merge_data.merge(matrix, on="entrez_gene_id", how="outer") + + logger.trace(f"Shape of merged data before merging logical tables: {merge_data.shape}") + if merge_data.empty: + logger.warning(f"No data is available for the '{context_name}' context. If this is intentional, ignore this error.") + return {} + + merge_data = _merge_logical_table(merge_data) + logger.debug(f"Shape of merged data after merging logical table: {merge_data.shape}") + num_sources = len(expression_list) + merge_data["active"] = 0 + merge_data["required"] = 0 + + logger.trace(f"Number of data sources: {num_sources}") + if adjust_for_missing_sources: # Subtract 1 from requirement per missing source + logger.trace("Adjusting for missing data sources") + merge_data.loc[:, "required"] = merge_data[expression_list].apply( + lambda x: expression_requirement - (num_sources - x.count()) if (expression_requirement - (num_sources - x.count()) > 0) else 1, + axis=1, + ) + else: # Do not adjust for missing sources + logger.trace("Not adjusting for missing data sources") + merge_data.loc[:, "required"] = merge_data[expression_list].apply( + lambda x: expression_requirement if (expression_requirement - (num_sources - x.count()) > 0) else 1, axis=1 + ) + logger.trace("Created expression requirement column") + + # Count the number of sources each gene is active in + # set to active in final output if we meet the adjusted expression requirement + merge_data["total_expressed"] = merge_data[expression_list].sum(axis=1) + merge_data.loc[merge_data["total_expressed"] >= merge_data["required"], "active"] = 1 + logger.trace("Created total expression requirement column") + + if force_activate_high_confidence: # If a gene is high-confidence in at least 1 data source, set it to active + logger.trace("Forcing high confidence genes") + merge_data.loc[merge_data[high_confidence_list].sum(axis=1) > 0, "active"] = 1 + + merge_data.dropna(inplace=True) + merge_data.to_csv(output_merged_filepath, index=False) + logger.success(f"Saved merged data to {output_merged_filepath}") + + logger.debug(f"Generating transcriptomic details using {output_merged_filepath}") + transcriptomic_details = await _get_transcriptmoic_details(merge_data, taxon_id=taxon_id) + logger.debug(f"Saving transcriptomic details to {output_transcriptomic_details_filepath}") + transcriptomic_details.dropna(inplace=True) + transcriptomic_details.to_csv(output_transcriptomic_details_filepath, index=False) + logger.success(f"Saved transcriptomic details to {output_transcriptomic_details_filepath}") + return {context_name: output_gene_activity_filepath.as_posix()} + + +async def _update_missing_data(input_matrices: InputMatrices, taxon_id: int) -> InputMatrices: + logger.trace("Updating missing genomic data") + 
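+    # Note: the asyncio.gather call below pairs every source with either a real lookup coroutine or
+    # a no-op placeholder (asyncio.sleep(0)), so results[i] stays aligned with the i-th source even
+    # when that source was not provided.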
matrix_keys: dict[str, list[pd.DataFrame]] = { + "trna": [input_matrices.trna], + "mrna": [input_matrices.mrna], + "scrna": [input_matrices.scrna], + "proteomics": [input_matrices.proteomics], + } + logger.trace(f"Gathering missing data for data sources: {','.join(key for key in matrix_keys if key is not None)}") + # fmt: off + results = await asyncio.gather( + *[ + # Using 'is not None' is required because the truth value of a Dataframe is ambiguous + get_missing_gene_data(values=input_matrices.trna, taxon_id=taxon_id) if input_matrices.trna is not None else asyncio.sleep(0), + get_missing_gene_data(values=input_matrices.mrna, taxon_id=taxon_id) if input_matrices.mrna is not None else asyncio.sleep(0), + get_missing_gene_data(values=input_matrices.scrna, taxon_id=taxon_id) if input_matrices.scrna is not None else asyncio.sleep(0), + get_missing_gene_data(values=input_matrices.proteomics, taxon_id=taxon_id) if input_matrices.proteomics is not None else asyncio.sleep(0), + ] + ) + # fmt: on + for i, key in enumerate(matrix_keys): + matrix_keys[key].append(results[i]) + + for matrix_name, (matrix, conversion) in matrix_keys.items(): + matrix: pd.DataFrame + if matrix is not None: + # fmt: off + existing_data = ( + "gene_symbol" if "gene_symbol" in matrix + else "entrez_gene_id" if "entrez_gene_id" in matrix + else "ensembl_gene_id" + ) + # fmt: on + logger.trace(f"Merging conversion data for {matrix_name}, existing id column is: {existing_data}") + input_matrices[matrix_name] = ( + input_matrices[matrix_name].merge(conversion, how="left", on=[existing_data]).dropna().reset_index(drop=True) + ) + + logger.debug("Updated missing genomic data") + return input_matrices + + +async def _process( + *, + context_name: str, + input_matrices: InputMatrices, + boolean_matrices: InputMatrices, + batch_names: BatchNames, + source_weights: SourceWeights, + taxon_id: int, + minimum_source_expression: int, + expression_requirement: int, + weighted_z_floor: int, + weighted_z_ceiling: int, + adjust_method: AdjustmentMethod, + merge_zfpkm_distribution: bool, + force_activate_high_confidence: bool, + adjust_for_missing_sources: bool, + output_merge_activity_filepath: Path, + output_transcriptomic_details_filepath: Path, + output_activity_filepaths: OutputCombinedSourceFilepath, + output_final_model_scores_filepath: Path, + output_figure_dirpath: Path | None, +): + """Merge different data sources for each context type.""" + logger.trace( + f"Settings: Min Expression: {minimum_source_expression}, Expression Requirement: {expression_requirement}, " + f"Weighted Z-Score Floor: {weighted_z_floor}, Weighted Z-Score Ceiling: {weighted_z_ceiling}, " + f"Adjust Method: {adjust_method.value}, Merge Z-Scores: {merge_zfpkm_distribution}, " + f"Force High Confidence: {force_activate_high_confidence}, Adjust for Missing: {adjust_for_missing_sources}" + ) + + # Collect missing genomic data for each of the input items in asynchronous parallel + input_matrices = await _update_missing_data(input_matrices, taxon_id) + logger.trace("Missing data updated") + + if merge_zfpkm_distribution: + logger.trace("Merging Z-Scores") + _begin_combining_distributions( + context_name=context_name, + input_matrices=input_matrices, + batch_names=batch_names, + source_weights=source_weights, + output_filepaths=output_activity_filepaths, + output_figure_dirpath=output_figure_dirpath, + output_final_model_scores=output_final_model_scores_filepath, + weighted_z_floor=weighted_z_floor, + weighted_z_ceiling=weighted_z_ceiling, + ) + 
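+        # Worked example (values illustrative): with 3 available sources, minimum_source_expression=1,
+        # and expression_requirement=2, the adjustment below yields (3 - 1) + 2 = 4 for PROGRESSIVE
+        # (later capped at the 3 available sources), 2 - (4 - 3) = 1 for REGRESSIVE, and 2 for FLAT.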
logger.trace("Finished merging Z-Scores")
+
+    # the more data sources available, the higher the expression requirement for the gene
+    num_sources = sum(1 for source in input_matrices if source is not None)
+    if adjust_method == AdjustmentMethod.PROGRESSIVE:
+        adjusted_expression_requirement = (num_sources - minimum_source_expression) + expression_requirement
+    # the more data sources available, the lower the expression requirement for the gene
+    elif adjust_method == AdjustmentMethod.REGRESSIVE:
+        # we use a hardcoded 4 here because that is the maximum number of contexts available
+        # (trna, mrna, scrna, and proteomics are 4 sources)
+        adjusted_expression_requirement = expression_requirement - (4 - num_sources)
+    elif adjust_method == AdjustmentMethod.FLAT:
+        adjusted_expression_requirement = expression_requirement
+    logger.debug(f"Adjusted expression requirement: {adjusted_expression_requirement}")
+
+    if adjusted_expression_requirement != expression_requirement:
+        logger.debug(
+            f"Expression requirement of '{expression_requirement}' adjusted to "
+            f"'{adjusted_expression_requirement}' using '{adjust_method.value}' adjustment method "
+            f"for '{context_name}'."
+        )
+
+    if adjusted_expression_requirement > num_sources:
+        logger.warning(
+            f"Expression requirement for {context_name} was calculated to be greater "
+            f"than the maximum number of input data sources. "
+            f"It will be forcibly changed to {num_sources} to prevent the output from having 0 active genes. "
+            f"Consider lowering the expression requirement or changing the adjustment method."
+        )
+        adjusted_expression_requirement = num_sources
+
+    if adjusted_expression_requirement < 1:  # never allow expression requirement to be less than one
+        logger.warning(
+            f"Expression requirement for {context_name} was calculated to be less than 1. "
+            "It will be changed to 1 to prevent the output from having 0 active genes. 
" + ) + adjusted_expression_requirement = 1 + + logger.debug(f"Final Expression Requirement: {adjusted_expression_requirement}") + await _merge_xomics( + context_name=context_name, + expression_requirement=adjusted_expression_requirement, + trna_boolean_matrix=boolean_matrices.trna, + mrna_boolean_matrix=boolean_matrices.mrna, + scrna_boolean_matrix=boolean_matrices.scrna, + proteomic_boolean_matrix=boolean_matrices.proteomics, + output_merged_filepath=output_merge_activity_filepath, + output_gene_activity_filepath=output_final_model_scores_filepath, + output_transcriptomic_details_filepath=output_transcriptomic_details_filepath, + taxon_id=taxon_id, + force_activate_high_confidence=force_activate_high_confidence, + adjust_for_missing_sources=adjust_for_missing_sources, + ) + + +def _build_batches( + trna_metadata: pd.DataFrame | None, + mrna_metadata: pd.DataFrame | None, + scrna_metadata: pd.DataFrame | None, + proteomic_metadata: pd.DataFrame | None, +) -> BatchNames: + batch_names = BatchNames(**{source.name.lower(): [] for source in SourceTypes}) + for source, metadata in zip(SourceTypes, [trna_metadata, mrna_metadata, scrna_metadata, proteomic_metadata], strict=True): + source: SourceTypes + metadata: pd.DataFrame + if metadata is None: + logger.trace(f"Metadata for source '{source.value}' is None, skipping") + continue + + metadata: pd.DataFrame # Re-assign type to assist in type hinting + for batch_num, study in enumerate(sorted(metadata["study"].unique()), start=1): + study_sample_names = metadata[metadata["study"] == study]["sample_name"].tolist() + batch_names[source.value].append(_BatchEntry(batch_num=batch_num, sample_names=study_sample_names)) + logger.debug(f"Found {len(study_sample_names)} sample names for study '{study}', batch number {batch_num}") + return batch_names + + +def _validate_source_arguments( + source: SourceTypes, + *args, +) -> None: + """Validate arguments for each source are valid. + + If at least one input item is provided, validate that all required items are also present. 
+ + :param matrix_or_filepath: The gene count matrix or filepath + :param boolean_matrix_or_filepath: The boolean matrix of gene activities + :param metadata_filepath_or_df: Dataframe or filepath to sample metadata + :param output_activity_filepath: Output filepath location + :param source: Source type + + """ + if any(i for i in args) and not all(i for i in args): + _log_and_raise_error( + f"Must specify all or none of '{source.value}' arguments", + error=ValueError, + level=LogLevel.ERROR, + ) + + +async def merge_xomics( # noqa: C901 + context_name: str, + output_merge_activity_filepath: Path, + output_transcriptomic_details_filepath: Path, + output_final_model_scores_filepath: Path, + output_figure_dirpath: Path | None, + taxon_id: int, + trna_matrix_or_filepath: Path | pd.DataFrame | None = None, + mrna_matrix_or_filepath: Path | pd.DataFrame | None = None, + scrna_matrix_or_filepath: Path | pd.DataFrame | None = None, + proteomic_matrix_or_filepath: Path | pd.DataFrame | None = None, + trna_boolean_matrix_or_filepath: Path | pd.DataFrame | None = None, + mrna_boolean_matrix_or_filepath: Path | pd.DataFrame | None = None, + scrna_boolean_matrix_or_filepath: Path | pd.DataFrame | None = None, + proteomic_boolean_matrix_or_filepath: Path | pd.DataFrame | None = None, + trna_metadata_filepath_or_df: Path | pd.DataFrame | None = None, + mrna_metadata_filepath_or_df: Path | pd.DataFrame | None = None, + scrna_metadata_filepath_or_df: Path | pd.DataFrame | None = None, + proteomic_metadata_filepath_or_df: Path | pd.DataFrame | None = None, + output_trna_activity_filepath: Path | None = None, + output_mrna_activity_filepath: Path | None = None, + output_scrna_activity_filepath: Path | None = None, + output_proteomic_activity_filepath: Path | None = None, + trna_weight: int = 1, + mrna_weight: int = 1, + scrna_weight: int = 1, + proteomic_weight: int = 2, + minimum_source_expression: int = 1, + expression_requirement: int | None = None, + adjust_method: AdjustmentMethod = AdjustmentMethod.FLAT, + force_activate_high_confidence: bool = False, + adjust_for_na: bool = False, + merge_zfpkm_distribution: bool = False, + weighted_z_floor: int = -6, + weighted_z_ceiling: int = 6, + log_level: LogLevel = LogLevel.INFO, + log_location: str | TextIOWrapper = sys.stderr, +): + """Merge expression tables of multiple sources (RNA-seq, proteomics) into one.""" + set_up_logging(level=log_level, location=log_location) + logger.info(f"Starting to merge all omics data for context: '{context_name}'") + + # fmt: off + source_data = { + SourceTypes.TRNA: (trna_matrix_or_filepath, trna_boolean_matrix_or_filepath, trna_metadata_filepath_or_df, output_trna_activity_filepath), + SourceTypes.MRNA: (mrna_matrix_or_filepath, mrna_boolean_matrix_or_filepath, mrna_metadata_filepath_or_df, output_mrna_activity_filepath), + SourceTypes.SCRNA: (scrna_matrix_or_filepath, scrna_boolean_matrix_or_filepath, scrna_metadata_filepath_or_df, output_scrna_activity_filepath), # noqa: E501 + SourceTypes.PROTEOMICS: (proteomic_matrix_or_filepath, proteomic_boolean_matrix_or_filepath, proteomic_metadata_filepath_or_df, output_proteomic_activity_filepath), # noqa: E501 + } + # fmt: on + for source in source_data: + _validate_source_arguments(source, *source_data[source]) + + if all( + file is None + for file in ( + trna_matrix_or_filepath, + mrna_matrix_or_filepath, + scrna_matrix_or_filepath, + proteomic_matrix_or_filepath, + ) + ): + _log_and_raise_error("No data was passed!", error=ValueError, level=LogLevel.ERROR) + + if adjust_method 
not in AdjustmentMethod: + _log_and_raise_error( + f"Adjustment method must be one of {AdjustmentMethod}; got: {adjust_method}", + error=ValueError, + level=LogLevel.ERROR, + ) + + if not expression_requirement or expression_requirement < 1: + logger.warning(f"Expression requirement must be at least 1! Setting to the minimum of 1 now. Got: {expression_requirement}") + expression_requirement = 1 + + if expression_requirement is None: + expression_requirement = sum( + test is not None + for test in ( + trna_matrix_or_filepath, + mrna_matrix_or_filepath, + scrna_matrix_or_filepath, + proteomic_matrix_or_filepath, + ) + ) + logger.debug(f"Expression requirement not specified; setting to {expression_requirement}") + + output_final_model_scores_filepath.parent.mkdir(parents=True, exist_ok=True) + if output_merge_activity_filepath: + output_merge_activity_filepath.parent.mkdir(parents=True, exist_ok=True) + if output_transcriptomic_details_filepath: + output_transcriptomic_details_filepath.parent.mkdir(parents=True, exist_ok=True) + if output_trna_activity_filepath: + output_trna_activity_filepath.parent.mkdir(parents=True, exist_ok=True) + if output_mrna_activity_filepath: + output_mrna_activity_filepath.parent.mkdir(parents=True, exist_ok=True) + if output_scrna_activity_filepath: + output_scrna_activity_filepath.parent.mkdir(parents=True, exist_ok=True) + if output_proteomic_activity_filepath: + output_proteomic_activity_filepath.parent.mkdir(parents=True, exist_ok=True) + if output_figure_dirpath: + output_figure_dirpath.mkdir(parents=True, exist_ok=True) + + # Build trna items + trna_matrix: pd.DataFrame | None + trna_boolean_matrix: pd.DataFrame | None + trna_metadata: pd.DataFrame | None + trna_matrix, trna_boolean_matrix, trna_metadata = await asyncio.gather(*[ + read_file(trna_matrix_or_filepath), + read_file(trna_boolean_matrix_or_filepath), + read_file(trna_metadata_filepath_or_df), + ]) + + # Build mrna items + mrna_matrix: pd.DataFrame | None + mrna_boolean_matrix: pd.DataFrame | None + mrna_metadata: pd.DataFrame | None + mrna_matrix, mrna_boolean_matrix, mrna_metadata = await asyncio.gather(*[ + read_file(mrna_matrix_or_filepath), + read_file(mrna_boolean_matrix_or_filepath), + read_file(mrna_metadata_filepath_or_df), + ]) + + # build scrna items + + scrna_matrix: pd.DataFrame | None + scrna_boolean_matrix: pd.DataFrame | None + scrna_metadata: pd.DataFrame | None + scrna_matrix, scrna_boolean_matrix, scrna_metadata = await asyncio.gather(*[ + read_file(scrna_matrix_or_filepath), + read_file(scrna_boolean_matrix_or_filepath), + read_file(scrna_metadata_filepath_or_df), + ]) + + # build proteomic items + proteomic_matrix: pd.DataFrame | None + proteomic_boolean_matrix: pd.DataFrame | None + proteomic_metadata: pd.DataFrame | None + proteomic_matrix, proteomic_boolean_matrix, proteomic_metadata = await asyncio.gather(*[ + read_file(proteomic_matrix_or_filepath), + read_file(proteomic_boolean_matrix_or_filepath), + read_file(proteomic_metadata_filepath_or_df), + ]) + + source_weights = SourceWeights(trna=trna_weight, mrna=mrna_weight, scrna=scrna_weight, proteomics=proteomic_weight) + input_matrices = InputMatrices(trna=trna_matrix, mrna=mrna_matrix, scrna=scrna_matrix, proteomics=proteomic_matrix) + boolean_matrices = InputMatrices( + trna=trna_boolean_matrix, + mrna=mrna_boolean_matrix, + scrna=scrna_boolean_matrix, + proteomics=proteomic_boolean_matrix, + ) + output_activity_filepaths = OutputCombinedSourceFilepath( + trna=output_trna_activity_filepath, + 
mrna=output_mrna_activity_filepath, + scrna=output_scrna_activity_filepath, + proteomics=output_proteomic_activity_filepath, + ) + batch_names = _build_batches( + trna_metadata=trna_metadata, + mrna_metadata=mrna_metadata, + scrna_metadata=scrna_metadata, + proteomic_metadata=proteomic_metadata, + ) + + await _process( + context_name=context_name, + input_matrices=input_matrices, + boolean_matrices=boolean_matrices, + source_weights=source_weights, + batch_names=batch_names, + taxon_id=taxon_id, + minimum_source_expression=minimum_source_expression, + expression_requirement=expression_requirement, + weighted_z_floor=weighted_z_floor, + weighted_z_ceiling=weighted_z_ceiling, + adjust_method=adjust_method, + merge_zfpkm_distribution=merge_zfpkm_distribution, + force_activate_high_confidence=force_activate_high_confidence, + adjust_for_missing_sources=adjust_for_na, + output_activity_filepaths=output_activity_filepaths, + output_merge_activity_filepath=output_merge_activity_filepath, + output_transcriptomic_details_filepath=output_transcriptomic_details_filepath, + output_final_model_scores_filepath=output_final_model_scores_filepath, + output_figure_dirpath=output_figure_dirpath, + ) diff --git a/main/como/migrations.py b/main/como/migrations.py new file mode 100644 index 00000000..c1d291b1 --- /dev/null +++ b/main/como/migrations.py @@ -0,0 +1,13 @@ +import pandas as pd + + +def gene_info_migrations(df: pd.DataFrame) -> pd.DataFrame: + """Migrate gene info DataFrame to the latest version. + + Args: + df: The input DataFrame containing gene information. + + Returns: + The migrated DataFrame with updated column names. + """ + return df.rename(columns={"hgnc_symbol": "gene_symbol"}) if "hgnc_symbol" in df.columns else df diff --git a/main/como/pipelines/__init__.py b/main/como/pipelines/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/main/como/pipelines/build_condition_heatmaps.py b/main/como/pipelines/build_condition_heatmaps.py new file mode 100644 index 00000000..16b970b8 --- /dev/null +++ b/main/como/pipelines/build_condition_heatmaps.py @@ -0,0 +1,283 @@ +"""Create heatmaps of conditions vs pathway flux. + +This pipeline will generate heatmaps showing the flux through various pathways +""" + +from __future__ import annotations + +import concurrent.futures +from collections import defaultdict +from functools import partial +from pathlib import Path + +import cobra +import matplotlib.pyplot as plt +import numpy as np +import numpy.typing as npt +import pandas as pd +from pandas._libs.missing import NAType + +from como.plot.heatmap import condition_vs_pathway + + +def find_possible_model_filepaths(search_dir: Path) -> list[Path]: + """Find potential files that could be constraint-based metabolic models. + + Args: + search_dir: The directory to search for models. + + Returns: + Potential filepaths that could be loaded as a `cobra.Model` object + """ + return [f for f in search_dir.rglob("*") if f.suffix in {".mat", ".json", ".sbml", ".xml"}] + + +def get_cobra_model_if_valid(filepath: Path) -> cobra.Model | None: + """Evaluate if a given filepath can be read as a `cobra.Model`. 
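+
+    Supported suffixes are .json, .mat, .yaml/.yml, and .xml/.sbml; any other suffix returns None.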
+ + Args: + filepath: The filepath to read + + Returns: + a `cobra.Model` object if the file can be read, otherwise None + """ + if filepath.suffix == ".json": + return cobra.io.load_json_model(filepath) + elif filepath.suffix == ".mat": + return cobra.io.load_matlab_model(filepath) + elif filepath.suffix in {".yaml", ".yml"}: + return cobra.io.load_yaml_model(filepath) + elif filepath.suffix in {".xml", ".sbml"}: + return cobra.io.read_sbml_model(filepath) + return None + + +def get_model_flux(model: cobra.Model, objective: str = "biomass_maintenance", solver: str = "gurobi") -> pd.Series: + """Get the flux through a CBMM. + + Args: + model: A `cobra.Model` object + objective: The objective function to optimize + solver: The solver to use + + Returns: + A pandas Series of reaction fluxes indexed by reaction ID + """ + model.objective = objective + model.solver = solver + solution = model.optimize() + return solution.fluxes + + +def get_many_model_flux( + models: list[cobra.Model], + objective: str = "biomass_maintenance", + solver: str = "gurobi", + cores: int = 4, + process_pool: concurrent.futures.ProcessPoolExecutor | None = None, + colnames: list[str] | None = None, + na_value: NAType | int | float = NAType, +) -> pd.DataFrame: + """Get the flux through many CBMMs. + + Args: + models: A list of `cobra.Model` objects + objective: The objective function to optimize + solver: The solver to use + cores: The number of CPU cores to use + process_pool: An existing process pool to use + colnames: Column names to use for the resulting dataframe + na_value: Value to use for missing values in the dataframe + + Returns: + A pandas DataFrame of reaction fluxes indexed by condition (row) and reaction ID (column) + + Raises: + ValueError: If `colnames` is provided and its length does not match the number of models + """ + if colnames and len(colnames) != len(models): + raise ValueError("Length of colnames must match length of models") + + pool = process_pool or concurrent.futures.ProcessPoolExecutor(max_workers=cores) + shutdown = not process_pool # if the user provided a pool, do not shut it down + + func = partial(get_model_flux, objective=objective, solver=solver) + series: list[pd.Series] = list(pool.map(func, models)) + for i, series_obj in enumerate(series): + series_obj.name = colnames[i] if colnames else f"model_{i}" + df: pd.DataFrame = pd.concat(list(series), axis="columns") + + if shutdown: + pool.shutdown(wait=True) + + if na_value != NAType: # no need to replace values that are already pd.NA + df = df.fillna(na_value) + + df = df.T + return df + + +def group_reactions_by_pathway(models: cobra.Model | list[cobra.Model], flux_df: pd.DataFrame) -> pd.DataFrame: + """Group reactions by their subsystem/pathway and sum the fluxes. 
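+
+    For example, if several reactions share the subsystem "Glycolysis" (illustrative name), their
+    per-condition fluxes are summed into a single "Glycolysis" column; reactions with an empty
+    subsystem are ignored.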
+ + Args: + models: A cobra.Model or list of cobra.Models + flux_df: A dataframe of reaction fluxes, indexed by condition and with reaction IDs as columns + + Returns: + A dataframe of pathway fluxes, indexed by condition and with pathways as columns + """ + pathways_by_reaction: dict[str, set[str]] = defaultdict(set) + models = [models] if isinstance(models, cobra.Model) else models + + for model in models: + for reaction in model.reactions: + reaction: cobra.Reaction + pathways_by_reaction[reaction.subsystem].add(reaction.id) + pathways_by_reaction.pop("", None) # remove the empty pathway; faster than checking every reaction's subsystem + + # pathway_flux: pd.DataFrame = pd.DataFrame(index=flux_df.index, columns=list(pathways_by_reaction.keys())) + # for condition in flux_df.index: + # for pathway, reactions in pathways_by_reaction.items(): + # pathway_flux.loc[condition, pathway] = flux_df.loc[condition, list(reactions)].sum() + pathway_fluxes: dict[str, pd.Series[npt.NDArray[np.floating]]] = {} + for pathway, reactions in pathways_by_reaction.items(): + reactions_in_df = list(reactions.intersection(flux_df.columns)) + if reactions_in_df: + pathway_fluxes[pathway] = flux_df[reactions_in_df].sum(axis=1) + return pd.DataFrame(pathway_fluxes) + + +def build_condition_vs_pathway_heatmap( + data: pd.DataFrame | list[cobra.Model] | Path | list[Path], + save_filepath: Path | None = None, + objective: str = "biomass_maintenance", + solver: str = "gurobi", + process_pool: concurrent.futures.ProcessPoolExecutor | None = None, + cores: int = 4, + condition_names: list[str] | None = None, + na_value: NAType | int | float = NAType, + *, + search_path: bool = False, + copy_df_when_building_plot: bool = False, + exclude_zero_flux_pathways: bool = False, +) -> plt.Figure: + """Create a heatmap of conditions vs flux through pathways. + + If `data` is a pd.DataFrame: + - The index names wile used as conditions and placed on the Y-axis + - The column names will be used as pathways and placed on the X-axis. The columns should indicate pathways. + + If `data` is a Path and `search_path` is True: + - Models will be recursively discovered under the given path + - Models will be simulated with the given objective and solver + - A dataframe will be built from the resulting series based on the above rules + + If `data` is a list of Paths: + - Models will be read and simulated for each path + - A pd.DataFrame will be built from the resulting series based on the above rules + + If `data` is a list of cobra.Models: + - Models will be simulated with the given objective and solver + - A pd.DataFrame will be built from the resulting series based on the above rules + + Args: + data: The data to use for the heatmap + search_path: Whether to search the given path for models + save_filepath: The filepath to save the heatmap to + objective: The objective function to optimize + solver: The solver to use + process_pool: An existing process pool to use + cores: The number of CPU cores to use + condition_names: Column names to use for the resulting dataframe if `data` is a Path or list of Paths + na_value: Value to use for missing values in the flux dataframe + copy_df_when_building_plot: Whether to copy the dataframe when building the plot. + This can be useful if the dataframe is going to be reused later. 
+ exclude_zero_flux_pathways: Whether to exclude pathways that have zero flux across all conditions + + Returns: + A matplotlib Figure object containing the heatmap + + Raises: + ValueError: If `search_path` is True and `data` is not a Path + """ + if not isinstance(data, Path) and search_path: + raise ValueError("If search_path is True, data must be a Path") + + flux_df: pd.DataFrame + if isinstance(data, pd.DataFrame): + return condition_vs_pathway( + data, + save_filepath=save_filepath, + copy_df=copy_df_when_building_plot, + exclude_zero_flux_pathways=exclude_zero_flux_pathways, + ) + elif isinstance(data, list) and isinstance(data[0], cobra.Model): + models = data + flux_df = get_many_model_flux( + models=data, + objective=objective, + solver=solver, + cores=cores, + process_pool=process_pool, + colnames=condition_names, + na_value=na_value, + ) + elif isinstance(data, Path): + if search_path: + possible_model_fps: list[Path] = find_possible_model_filepaths(data) + models = [] + for fp in possible_model_fps: + if isinstance(model := get_cobra_model_if_valid(fp), cobra.Model): + models.append(model) + flux_df = get_many_model_flux( + models, + objective=objective, + solver=solver, + cores=cores, + process_pool=process_pool, + colnames=condition_names, + na_value=na_value, + ) + else: + models = get_cobra_model_if_valid(data) + flux_df = pd.DataFrame(get_model_flux(models, objective=objective, solver=solver)) + elif isinstance(data, list) and isinstance(data[0], Path): + models = [get_cobra_model_if_valid(fp) for fp in data] + flux_df = get_many_model_flux( + models, + objective=objective, + solver=solver, + cores=cores, + process_pool=process_pool, + colnames=condition_names, + na_value=na_value, + ) + + flux_df = group_reactions_by_pathway(models=models, flux_df=flux_df) + return condition_vs_pathway(data=flux_df, save_filepath=save_filepath) + + +def _main(): + models = [ + Path("/home/joshl/projects/ImmunoMetabolism/data/model_build/A/A_pDCs/A_pDCs_model_imat.json"), + Path("/home/joshl/projects/ImmunoMetabolism/data/model_build/B/B_pDCs/B_pDCs_model_imat.json"), + Path("/home/joshl/projects/ImmunoMetabolism/data/model_build/C/C_pDCs/C_pDCs_model_imat.json"), + Path("/home/joshl/projects/ImmunoMetabolism/data/model_build/D/D_pDCs/D_pDCs_model_imat.json"), + Path("/home/joshl/projects/ImmunoMetabolism/data/model_build/E/E_pDCs/E_pDCs_model_imat.json"), + ] + save_path = Path( + f"/home/joshl/projects/ImmunoMetabolism/results/figures/{models[0].stem.removeprefix('A_').removesuffix('_model_imat')}_heatmap.png" + ) + + fig = build_condition_vs_pathway_heatmap( + data=models, + cores=5, + condition_names=["Age Group A", "Age Group B", "Age Group C", "Age Group D", "Age Group E"], + save_filepath=save_path, + ) + fig.show() + + +if __name__ == "__main__": + _main() diff --git a/main/como/plot/__init__.py b/main/como/plot/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/main/como/plot/heatmap.py b/main/como/plot/heatmap.py new file mode 100644 index 00000000..0d6e4e55 --- /dev/null +++ b/main/como/plot/heatmap.py @@ -0,0 +1,103 @@ +from __future__ import annotations + +import math +from pathlib import Path + +import matplotlib.colors as mcolors +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import seaborn as sns +from matplotlib.ticker import FixedLocator + + +def condition_vs_pathway( + data: pd.DataFrame, + save_filepath: Path | None = None, + *, + copy_df: bool = False, + exclude_zero_flux_pathways: bool = False, +) -> plt.Figure: + 
"""Build a heatmap of fluxes through pathways across conditions. + + Args: + data: Index values are conditions, column names are pathways + save_filepath: If provided, the resulting figure will be saved to this location + copy_df: Should the incoming dataframe be copied to prevent modifications to data? + exclude_zero_flux_pathways: Should pathways that have 0 flux across all rows be excluded? + + Returns: + The resulting `matpotlib.pyplt.Figure` object + """ + plot_df: pd.DataFrame = data.copy() if copy_df else data + plot_df = plot_df.astype(np.float32) + fig = plt.figure(figsize=(100, 40), dpi=175) + + if exclude_zero_flux_pathways: + # Select pathways that have at least one non-zero value + plot_df = plot_df.loc[:, plot_df.where(plot_df != 0).any(axis=0)] + + # Identify the second largest (pos or neg) value + # This is needed in order to set the upper & lower bounds for the graph, excluding +/- 1000-flux values + plot_df[plot_df > 1000] = 1001 + plot_df[plot_df < -1000] = -1001 + second_largest_positive = plot_df[plot_df > 0].stack().drop_duplicates().nlargest(2).iloc[-1] + second_largest_negative = plot_df[plot_df < 0].stack().drop_duplicates().nsmallest(2).iloc[-1] + vmax = max(abs(second_largest_negative), second_largest_positive) + + # Convert tick marks to reasonable values: + # max tick < 100: round to 10s place + # max tick < 1_000: round to 100s place + # max tick < 10_000: round to 1_000s place + base = 10 if vmax < 100 else 100 if vmax < 10_000 else 1000 + vmax_root = math.ceil(vmax / base) * base + + # Create 5 evenly spaced ticks along the legend + ticks = np.linspace(-vmax_root, vmax_root, 5) + + # Generate legend gradient + norm = mcolors.TwoSlopeNorm(vmin=-vmax_root, vcenter=0, vmax=vmax_root) + + # If a value falls outside of `vmax_root`, set it to the following colors + cmap = plt.get_cmap("coolwarm").copy() + cmap.set_over("#660033") + cmap.set_under("#000099") + + ax: plt.Axes = sns.heatmap( + data=plot_df, + linewidths=1.0, + linecolor="#686868", + center=0, + yticklabels=True, + xticklabels=True, + norm=norm, + cmap=cmap, + cbar_kws={"extend": "both", "label": f"Flux ratio (clipped at ±{vmax:.0f})"}, + ) + + plt.title("Metabolic Model Flux Sum through Pathways", fontsize=100) + + plt.xlabel("Pathway", fontsize=80) + ax.tick_params(axis="x", which="major", labelsize=55, labelrotation=90) + + plt.ylabel("Condition", fontsize=85) + ax.tick_params(axis="y", which="major", labelsize=55, labelrotation=0) + + cbar = ax.collections[0].colorbar + cbar.set_ticks(ticks) + cbar.ax.yaxis.set_major_locator(FixedLocator(ticks)) + cbar.update_ticks() + + # Add extended triangles that are detached from the colorbar (prevents very large pos/neg values from blowing out the legend) + cbar.ax.text(0.5, 1.06, "> +1000", ha="center", va="bottom", transform=cbar.ax.transAxes, fontsize=40) + cbar.ax.text(0.5, -0.06, "< -1000", ha="center", va="top", transform=cbar.ax.transAxes, fontsize=40) + cbar.ax.tick_params(labelsize=40) + cbar.set_label("Flux", rotation=270, labelpad=40) + + fig.tight_layout(h_pad=0.85) + + if save_filepath: + save_filepath.parent.mkdir(parents=True, exist_ok=True) + fig.savefig(save_filepath, transparent=False, bbox_inches="tight") + + return fig diff --git a/main/como/plot/z_score.py b/main/como/plot/z_score.py new file mode 100644 index 00000000..0822e63a --- /dev/null +++ b/main/como/plot/z_score.py @@ -0,0 +1,47 @@ +from pathlib import Path + +import pandas as pd +import seaborn as sns +from loguru import logger +from matplotlib import pyplot as plt + + +def 
z_score_distribution( + df: pd.DataFrame, + title: str, + output_filepath: Path, +): + """Graph a z-score distribution. + + :param df: The z-score data to graph + :param title: Title to add to graph + :param output_filepath: Output PNG filepath location + :return: None + """ + if output_filepath.suffix not in {".png", ".pdf", ".svg"}: + logger.warning(f"Expected .png, .pdf, or .svg suffix for output_png_filepath, got {output_filepath.suffix}. Defaulting to .pdf") + output_filepath = output_filepath.with_suffix(".pdf") + logger.trace("Graphing z-score distribution") + output_filepath.parent.mkdir(parents=True, exist_ok=True) + output_filepath.unlink(missing_ok=True) + + plt.figure(figsize=(10, 6)) + + if len(df["source"].unique()) == 1: + ax = sns.histplot(df, x="zscore", bins=100, kde=True) + sns.rugplot(df, x="zscore", ax=ax) + else: + sns.histplot(df, x="zscore", hue="source", bins=100, kde=True, element="step") + plt.legend(loc="upper right", frameon=False, title=None) + + plt.title(title) + plt.xlabel("Z-score") + plt.ylabel("Frequency") + plt.xticks(fontsize=12) + plt.yticks(fontsize=12) + plt.gca().spines["top"].set_visible(False) + plt.gca().spines["right"].set_visible(False) + plt.tight_layout() + plt.savefig(output_filepath) + plt.close() + logger.success(f"Saved z-score distribution graph to '{output_filepath}'") diff --git a/main/como/project.py b/main/como/project.py new file mode 100644 index 00000000..19779940 --- /dev/null +++ b/main/como/project.py @@ -0,0 +1,116 @@ +from __future__ import annotations + +from pathlib import Path +from typing import ClassVar + +from loguru import logger + + +class SingletonMeta(type): + _instances: ClassVar[dict] = {} + + def __call__(cls, *args, **kwargs) -> SingletonMeta: + """Validate that changes to the `__init__` argument do not affect the returned instance. 
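+
+        For example, Config(data_dir=Path("/a")) followed by Config(data_dir=Path("/b")) returns the
+        original instance still configured with "/a" (paths illustrative); use Config().update(...) to
+        change paths after creation.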
+ + Args: + args: Positional arguments for the class constructor + kwargs: Keyword arguments for the class constructor + + Returns: + The singleton instance of the class + """ + if cls not in cls._instances: + instance = super().__call__(*args, **kwargs) + cls._instances[cls] = instance + return cls._instances[cls] + + +class Config(metaclass=SingletonMeta): + def __init__( + self, + data_dir: Path | None = None, + config_dir: Path | None = None, + result_dir: Path | None = None, + ) -> None: + """Initialize the Config object.""" + current_dir = Path.cwd() + + self.data_dir = Path(data_dir) if data_dir else None + if self.data_dir is None: + logger.warning(f"'data_dir' not provided to Config, using {Path.cwd() / 'data'}") + self.data_dir = current_dir / "data" + self.data_dir.mkdir(parents=True, exist_ok=True) + + self.config_dir = Path(config_dir) if config_dir else None + if self.config_dir is None: + logger.warning(f"'config_dir' not provided to Config, using {self.data_dir / 'config_sheets'}") + self.config_dir = self.data_dir / "config_sheets" + self.config_dir.mkdir(parents=True, exist_ok=True) + + self.result_dir = Path(result_dir) if result_dir else None + if self.result_dir is None: + logger.warning(f"'results_dir' not provided to Config, using {self.data_dir / 'results'}") + self.result_dir = self.data_dir / "results" + self.result_dir.mkdir(parents=True, exist_ok=True) + + # Additional directories + self.code_dir = current_dir / "main" / "como" + self.log_dir = self.data_dir / "logs" + self.matrix_dir = self.data_dir / "data_matrices" + self.figures_dir = self.result_dir / "figures" + + self.log_dir.mkdir(parents=True, exist_ok=True) + self.matrix_dir.mkdir(parents=True, exist_ok=True) + self.figures_dir.mkdir(parents=True, exist_ok=True) + + def update(self, **kwargs): + """Update a key in the config object. + + Args: + kwargs: keyword argumentsto set for the singleton + """ + for key, value in kwargs.items(): + if hasattr(self, key): + setattr(self, key, Path(value) if value else getattr(self, key)) + else: + logger.warning(f"{key} is not a valid attribute of Config") + + def get_context_path(self, context_name: str, create: bool = True) -> Path: + """Get path for a specific context, optionally creating it. + + Args: + context_name: Name of the context (subdirectory). + create: Whether to create the directory if it doesn't exist. + + Returns: + Full path to the context directory. + """ + path = self.result_dir / context_name + if create: + path.mkdir(parents=True, exist_ok=True) + return path + + def get_r_path(self, path: Path) -> str: + """Convert a Path object to an R-compatible path string. + + Args: + path: Path object to convert. + + Returns: + R-compatible path string. + """ + return path.as_posix() + + def get_matrix_path(self, context_name: str, filename: str) -> Path: + """Get path for a matrix file in a specific context. + + Arg: + context_name: Name of the context (subdirectory). + filename: Name of the matrix file. + + Returns: + Full path to the matrix file. 
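+
+        For example (names illustrative), get_matrix_path("naiveB", "gene_counts_naiveB.csv") returns
+        "<matrix_dir>/naiveB/gene_counts_naiveB.csv", creating the "naiveB" sub-directory if needed.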
+ """ + path = self.matrix_dir / context_name + path.mkdir(parents=True, exist_ok=True) + return path / filename diff --git a/main/como/proteomics/Crux.py b/main/como/proteomics/Crux.py new file mode 100644 index 00000000..9351cf4e --- /dev/null +++ b/main/como/proteomics/Crux.py @@ -0,0 +1,476 @@ +# ruff: noqa + +import asyncio +import multiprocessing +import os +import re +import subprocess +from multiprocessing.sharedctypes import Synchronized +from pathlib import Path + +import numpy as np +import pandas as pd +import tqdm +from fast_bioservices import BioDBNet + +from como.proteomics.FileInformation import FileInformation, clear_print + +# TODO: Integrate crux percolator into this workflow + + +class RAWtoMZML: + def __init__(self, file_information: list[FileInformation], core_count: int) -> None: + """Convert RAW files to mzML format.""" + self.file_information: list[FileInformation] = file_information + self._core_count: int = core_count + + # These items are used to track the progress of the conversion + self._conversion_counter: Synchronized = Synchronized(multiprocessing.Value("i", 0)) + + # ----- Function Calls ----- + self.raw_to_mzml_wrapper() # Convert from raw files to mzML + + def raw_to_mzml_wrapper(self) -> None: + """Multiprocess conversion of raw files to mzML using ThermoRawFileParser.""" + # split self.file_information into self._core_count chunks + # Always round this division up to ensure all files are processed + num_chunks: int = int(np.ceil(len(self.file_information) / self._core_count)) + file_chunks: list[list[FileInformation]] = [ + self.file_information[i : i + num_chunks] for i in range(0, len(self.file_information), num_chunks) + ] + + jobs: list[multiprocessing.Process] = [] + for information in file_chunks: + # Parenthesis + comma needed to make tuple in "args" + job = multiprocessing.Process(target=self.raw_to_mzml, args=(information,)) + jobs.append(job) + + for job in jobs: + job.start() + for job in jobs: + job.join() + for job in jobs: + job.terminate() + + def raw_to_mzml(self, file_information: list[FileInformation]) -> None: + """Convert a list of raw files to mzML format.""" + for information in file_information: + self._conversion_counter.acquire() + self._conversion_counter.value += 1 + clear_print( + f"Starting raw -> mzML conversion: {self._conversion_counter.value} / {len(self.file_information)} - {information.raw_file_name}" + ) + self._conversion_counter.release() + + information.mzml_base_path.mkdir(parents=True, exist_ok=True) + subprocess.run( # noqa: S603 + [ # noqa: S607 + "thermorawfileparser", + f"--input={information.raw_file_path!s}", + f"--output_file={information.mzml_file_path!s}", + ], + stdout=subprocess.PIPE, + ) + + +class MZMLtoSQT: + def __init__( + self, + file_information: list[FileInformation], + fasta_database: Path, + core_count: int, + ) -> None: + """This file is responsible for calling the crux-toolkit utilities to process raw proteomics data + The tools here are called through the command line + + The following steps must be performed: + 1. Convert RAW files to mzML using thermorawfileparser, saving these to the default mzML output directory + 2. Analyze the mzML files using Crux Comet + 3. 
Save the SQT files to the default SQT output directory + """ + # These items are passed into the class + self._file_information: list[FileInformation] = file_information + self._fasta_database: Path = fasta_database + self._core_count: int = core_count + + # ----- Function Calls ----- + self.mzml_to_sqt() # Analyze mzML files, creating SQT files + + def mzml_to_sqt(self) -> None: + """This function analyzes the converted mzML files and creates SQT files + This function does not use multiprocessing, as Crux Comet incorporates its own multiprocessing + """ + for i, file_information in enumerate(self._file_information): + # Clear the previous line of output. Required if the new line is shorter than the previous line + clear_print(f"Creating SQT: {i + 1} / {len(self._file_information)} - {file_information.sqt_file_name}") + + # Call subprocess on command + # Only create the SQT file + subprocess.run( + [ + "crux", + "comet", + "--output_sqtfile", + "1", + "--output-dir", + file_information.sqt_base_path, + "--overwrite", + "T", + "--decoy_search", + "1", + "--num_threads", + str(self._core_count), + file_information.mzml_file_path, # Input mzML file + self._fasta_database, # Database to search + ], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + # Replace all "comet.*" in output directory with the name of the file being processed + comet_files = [str(file) for file in os.listdir(file_information.sqt_base_path) if str(file).startswith("comet.")] + for file_name in comet_files: + # Determine the old file path + old_file_path: Path = Path(file_information.sqt_base_path, file_name) + + # Determine the new file path + new_file_name = file_name.replace("comet", file_information.base_name) + new_file_path: Path = Path(file_information.sqt_base_path, new_file_name) + + # Rename the file + os.rename(old_file_path, new_file_path) + + clear_print("SQT creation finished") + print() + + +class SQTtoCSV: + def __init__(self, file_information: list[FileInformation], core_count: int) -> None: + """This class is meant to convert UniProt IDs to Entrez IDs using BioDBNet""" + self._biodbnet: BioDBNet = BioDBNet() + self._file_information: list[FileInformation] = file_information + self._core_count: int = min(core_count, 4) # Maximum of 4 cores + + # Merged frames contains all dataframes + # Split frames contains the S1/S2/etc dataframes, extracted from Merged frames + self._merged_frames: dict[str, pd.DataFrame] = {} + self._split_frames: dict[str, list[pd.DataFrame]] = {} + + # Max of 15 asynchronous tasks at once + # From: https://stackoverflow.com/a/48256949 + # self._semaphore = asyncio.Semaphore(50) + + self.collect_uniprot_ids_and_ion_intensity() + asyncio.run(self._convert_uniprot_wrapper()) + self.create_merged_frame() + self.new_write_data() + + def _uniprot_from_fasta_header(self, fasta_header: str, separator: str = "|") -> str: + """This function is responsible for collecting the first-index field from a pipe-separated string""" + return fasta_header.split(separator)[1] + + def collect_uniprot_ids_and_ion_intensity(self) -> None: + """This function is responsible for collecting the UniProt IDs from the input sqt files + + Documentation: https://crux.ms/file-formats/sqt-format.html + + We must perform the following + 1. Skip every line starting with an "H" + 2. Skip every line starting with an "M" + 3. Collect every line starting with an "S". This contains the ion intensity + - Collect the next line that starts with an "L". This contains the UniProt ID + 4. 
Repeat steps 2 and 3 until the end of the file + """ + for i, file_information in enumerate(self._file_information): + # Create a dictionary with strings as the keys and lists as the values + # uniprot_id will be a list of strings + # ion_intensity will be a list of floats + ion_intensities: list[float] = [] + fasta_headers: list[list[str]] = [] + + # Use dictionary comprehension to create data dictionary + average_intensities_dict: dict = {key: [] for key in list(file_information.intensity_df.columns)} + # average_intensities: pd.DataFrame = pd.DataFrame(columns=["uniprot", replicate_name]) # fmt: skip + + with open(file_information.sqt_file_path) as i_stream: + """ + We are going to use spectra_line_nums if the list starts with "S" + Beneath this, we are going to collect every locus ("L") that does not have a "decoy_" in a nested list + The result will be each spectra value corresponds to a list of UniProt IDs + """ + for j, line in enumerate(i_stream): + # If the line starts with an "S", collect it + if line.startswith("S"): + # If the length of ion_intensities is not equal to fasta_headers, + # We have added an intensity that did not have valid locus data + # (i.e., it only contained "decoy_") + if len(ion_intensities) != len(fasta_headers): + ion_intensities.pop() + + intensity: float = float(line.split("\t")[7]) + ion_intensities.append(intensity) + fasta_headers.append([]) + + # Get sequential lines starting with "L" and append the uniprot ID to the dataframe + elif line.startswith("L"): + fasta_header = line.split("\t")[1] + if fasta_header.startswith("decoy_"): + continue + else: + # Append fasta header to most recent list created + fasta_headers[-1].append(fasta_header) + + # Append corresponding values in ion_intensities and fasta_headers to the average_intensities list + # concat_dict: dict = {"uniprot": [], replicate_name: []} + for j in range(len(ion_intensities)): + current_intensity = ion_intensities[j] + + for k in range(len(fasta_headers[j])): + current_fasta_header = fasta_headers[j][k] + + # Get the UniProt ID from the fasta header + uniprot_id = self._uniprot_from_fasta_header(current_fasta_header) + + # Create a new row in the dataframe + average_intensities_dict["uniprot"].append(uniprot_id) + average_intensities_dict[file_information.batch].append(current_intensity) + + # Fill the "symbol" list with NA, as they are not converted yet + average_intensities_dict["symbol"] = [np.nan] * len(average_intensities_dict["uniprot"]) + + # Assign the file_information intensity dataframe to the gathered values + self._file_information[i].intensity_df = pd.DataFrame(average_intensities_dict) + self._file_information[i].intensity_df = self._file_information[i].intensity_df.groupby("uniprot", as_index=False).mean() + + async def _convert_uniprot_wrapper(self) -> None: + """This function is a multiprocessing wrapper around the convert_ids function""" + values = [self.async_convert_uniprot(self._file_information[i]) for i in range(len(self._file_information))] + + # Create a progress bar of results + # From: https://stackoverflow.com/a/61041328/ + progress_bar = tqdm.tqdm(desc="Starting UniProt to Gene Symbol conversion... 
", total=len(self._file_information)) + for i, result in enumerate(asyncio.as_completed(values)): + await result # Get result from asyncio.as_completed + progress_bar.set_description(f"Working on {i + 1} of {len(self._file_information)}") + progress_bar.update() + + async def async_convert_uniprot(self, file_information: FileInformation) -> None: + chunk_size: int = 400 + num_chunks: int = np.ceil(len(file_information.intensity_df) / chunk_size) + frame_chunks: pd.DataFrame = np.array_split(file_information.intensity_df, num_chunks) + + lower_iteration: int = 0 + upper_iteration: int = 0 + + chunk: pd.DataFrame + for chunk in frame_chunks: + upper_iteration += len(chunk) + input_values: list[str] = list(chunk["uniprot"]) + + # Limit number of asynchronous calls to value defined in self._semaphore + loop = asyncio.get_event_loop() + # async with self._semaphore: + # gene_symbols: pd.DataFrame = await loop.run_in_executor(None, self._biodbnet.db2db, "UniProt Accession", "Gene Symbol", input_values) + gene_symbols: pd.DataFrame = await loop.run_in_executor( + None, + self._biodbnet.db2db, + input_values, + "UniProt Accession", + "Gene Symbol", + ) + + # The index is UniProt IDs. Create a new column of these values + gene_symbols["uniprot"] = gene_symbols.index + gene_symbols.rename(columns={"Gene Symbol": "symbol"}, inplace=True) + + # Create a new "index" column to reset the index of gene_symbols + gene_symbols["index"] = range(lower_iteration, upper_iteration) + gene_symbols.set_index("index", inplace=True, drop=True) + gene_symbols = pd.merge(gene_symbols, chunk[[file_information.batch]], left_index=True, right_index=True) + + lower_iteration += len(chunk) + file_information.intensity_df.update(gene_symbols) + + def create_merged_frame(self) -> None: + """This function is responsible for merging all dataframes of a specific cell type into a single master frame + This will allow for aggregating S1R1/S1R2/etc. dataframes into a single S1 dataframe + """ + for file in self._file_information: + cell_type = file.cell_type + if cell_type not in self._merged_frames.keys(): + self._merged_frames[cell_type] = pd.DataFrame(columns=["symbol", "uniprot"]) + + self._merged_frames[cell_type] = pd.concat([self._merged_frames[cell_type], file.intensity_df]) + + # Drop the 'uniprot' column and merge by cell type for each dataframe in master_frame + self._merged_frames[cell_type].drop(columns=["uniprot"], inplace=True) + self._merged_frames[cell_type] = self._merged_frames[cell_type].groupby("symbol").mean() + + # Create a new column "symbol" that is the index + self._merged_frames[cell_type]["symbol"] = self._merged_frames[cell_type].index + + # Reset the index to ensure the dataframe is in the correct order + self._merged_frames[cell_type].reset_index(inplace=True, drop=True) + + # Replace all nan values with 0 + self._merged_frames[cell_type].fillna(0, inplace=True) + + def split_abundance_values(self) -> None: + """This function is responsible for splitting abundance values into separate columns based on their S#R# identifier + + It will start by finding all S1R*, S2R*, etc. 
columns in each cell type under self._master_frames + From here, it will merge these dataframes into a new dataframe under self._split_frames, corresponding to the cell type + These split frames can then be written by self.write_data() + + Example Dataframe: + Starting + -------- + symbol,naiveB_S1R1,naiveB_S1R2,naiveB_S2R1 + A ,100 ,0 ,50 + B ,200 ,75 ,100 + C ,150 ,100 ,175 + + Ending + ------ + symbol,naiveB_S1,naiveB_S2 + A ,50 ,50 + B ,137.5 ,100 + C ,125 ,175 + """ + # Get a new line to print output on + print() + + # Copy the dictionary keys from merged_frames into the split_frames + for key in self._merged_frames.keys(): + if key not in self._split_frames.keys(): + self._split_frames[key] = [] + + # 'cell_type' is a dictionary key + for cell_type in self._merged_frames: + # Must collect the maximum S# value found in the master_frame + # ------------------- + max_iteration: int = 0 + dataframe = self._merged_frames[cell_type] + for column in dataframe.columns: + # Find the {cell_type}_S#R# column using regex + if re.match(rf"{cell_type}_S\d+R\d+", column): + # Find the S# value using regex + # iteration: re.Match | None = int(re.search(r"S(\d+)", column).group(1)) + iteration: re.Match | None = re.search(r"S(\d+)", column) + + if isinstance(iteration, re.Match): + iteration_num: int = int(iteration.group(1)) + + # Find the maximum S# value + if iteration_num > max_iteration: + max_iteration = iteration_num + # ------------------- + + # Aggregate all S# values from 1 to max_iteration + # The new dataframes will go into the self._split_frames + for i in range(1, max_iteration + 1): + # Create a new dataframe to split the S# columns from + split_frame: pd.DataFrame = dataframe.copy() + # Get the current S{i} columns in + abundance_columns: list[str] = [column for column in split_frame.columns if re.match(rf"{cell_type}_S{i}R\d+", column)] + take_columns: list[str] = ["symbol"] + abundance_columns + average_intensity_name: str = f"{cell_type}_S{i}" + + # Calculate average intensities and assign a new column + average_intensity_values = split_frame[take_columns].mean(axis=1) + # split_frame.loc[:, take_columns].mean(axis=1) + split_frame[average_intensity_name] = average_intensity_values + + # Purge the S#R## column names, they are no longer required + # We now have a new dataframe with "symbol" and "{cell_type}_S{i}" columns + # Unpack abundance_columns to create a single list + split_frame.drop(columns=abundance_columns, inplace=True) + + # If the "{cell_type}_S{i}" column is 0, remove it + split_frame = split_frame[split_frame[average_intensity_name] != 0] + split_frame.reset_index(inplace=True, drop=True) + + # Find duplicate "symbol" values and average across them + # Duplicate symbols are a result of protein isoforms mapping to the same gene + split_frame = split_frame.groupby("symbol", as_index=False).mean() + + self._split_frames[cell_type].append(split_frame) + + def new_write_data(self) -> None: + """This function is responsible for writing the dataframes found in self._merge_frames to the respective cell type file""" + # Get a list of CSV file locations + csv_file_location: dict[str, Path] = {} + for information in self._file_information: + if information.cell_type not in csv_file_location: + csv_file_location[information.cell_type] = information.intensity_csv + + # Sort columns of each cell type dataframe + for key in self._merged_frames.keys(): + # Get the "symbol" column so it can be placed at index 0 + symbol_column = self._merged_frames[key].pop("symbol") + + # Sort 
{cell_type}_S# columns + col_names: list[str] = list(self._merged_frames[key].columns) + self._merged_frames[key].reindex(sorted(col_names), axis=1) + + # Place the "symbol" column back in at index 0 + self._merged_frames[key].insert(0, "symbol", symbol_column) + + # Write the dataframe to its appropriate location + self._merged_frames[key].to_csv(csv_file_location[key], index=False) + + def write_data(self) -> None: + """This function creates a unique dataframe for each cell type found in the intensity dataframes + from the self._file_information list + It merges these intensity dataframes, creating a new column for each dataframe within each cell type + + The final dataframes will have the following headers: + 1. uniprot_ids + 2. gene_ids + 3. ion_intensities + + It then writes the dataframes to separate CSV files, dependent on the cell type + This function is responsible for writing a dictionary to a csv file + + The CSV will be written to the intensity_csv value within each FileInformation object + """ + # Create a dictionary containing the cell type as keys and the final dataframe as values + # This will be used to write the dataframes to separate CSV files + master_frames: dict[str, pd.DataFrame] = {} + + # Iterate through each FileInformation object + for file_information in self._file_information: + # Create a new dataframe for each cell type + if file_information.cell_type not in master_frames: + parent_directory: Path = Path(file_information.intensity_csv).parent + parent_directory.mkdir(parents=True, exist_ok=True) + + master_frames[file_information.cell_type] = pd.DataFrame(columns=file_information.base_columns) # fmt: skip + + # Update the master frame for the current cell type + # The master frame should be matched by the uniprot column + master_frames[file_information.cell_type] = pd.merge( + master_frames[file_information.cell_type], + file_information.intensity_df, + on=["uniprot", "symbol"], + how="outer", + ) + + # Once merging is complete, write each cell type to its CSV file + for cell_type in master_frames: + master_frames[cell_type].replace(np.nan, 0, inplace=True) + master_frames[cell_type].sort_values(by="symbol", inplace=True, ignore_index=True) + + csv_path = FileInformation.intensity_file_path(cell_type=cell_type) + master_frames[cell_type].to_csv(csv_path, index=False) + + @property + def file_information(self) -> list[FileInformation]: + """Reassigning the values from the incoming file_manager into a shared-memory variable means + we must provide an option to return the file_information list + """ + return self._file_information + + +if __name__ == "__main__": + print("Use the proteomics_preprocess.py file") diff --git a/main/como/proteomics/FTPManager.py b/main/como/proteomics/FTPManager.py new file mode 100644 index 00000000..b3b96052 --- /dev/null +++ b/main/como/proteomics/FTPManager.py @@ -0,0 +1,221 @@ +# ruff: noqa + +"""This file is responsible downloading data found at FTP links + +TODO: Find a way to mark a file as "downloaded" + - Keep a list of file names in a ".completd" hidden folder? 
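+
+Example (a minimal sketch for listing the files available under a root FTP link; the PRIDE URL is illustrative):
+
+    reader = Reader(
+        root_link="ftp://ftp.pride.ebi.ac.uk/pride-archive/2022/02/PXD026140",
+        file_extensions=["raw"],
+    )
+    for name, size in zip(reader.file_names, reader.file_sizes):
+        print(name, size)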
+""" + +import asyncio +import multiprocessing +import time +import typing +from multiprocessing.sharedctypes import Synchronized +from urllib.parse import urlparse + +import aioftp +from loguru import logger + +from como.proteomics.FileInformation import FileInformation, clear_print +from como.utils import _log_and_raise_error +from como.data_types import LogLevel + + +async def aioftp_client(host: str, username: str = "anonymous", password: str = "guest", port: int = 21, max_attempts: int = 3) -> aioftp.Client: + """This class is responsible for creating a "client" connection""" + connection_successful: bool = False + attempt_num: int = 1 + + # Attempt to connect, throw error if unable to do so + while not connection_successful and attempt_num <= max_attempts: + try: + client: aioftp.Client = aioftp.Client() + await client.connect(host, port) + await client.login(user=username, password=password) + connection_successful = True + except ConnectionResetError: + # Make sure this print statement is on a new line on the first error + if attempt_num == 1: + print() + + # Line clean: https://stackoverflow.com/a/5419488/13885200 + clear_print(f"Attempt {attempt_num} of {max_attempts} failed to connect") + attempt_num += 1 + time.sleep(5) + if not connection_successful: + _log_and_raise_error( + "Could not connect to FTP server", + error=ConnectionResetError, + level=LogLevel.ERROR, + ) + + return client + + +class Reader: + def __init__( + self, + root_link: str, + file_extensions: list[str], + max_attempts: int = 3, + port: int = 21, + user: str = "anonymous", + passwd: str = "guest", + ) -> None: + """This class is responsible for reading data about root FTP links""" + self._root_link: str = root_link + self._extensions: list[str] = file_extensions + self._max_attempts: int = max_attempts + self._port: int = port + self._user: str = user + self._passwd: str = passwd + + self._files: list[str] = [] + self._file_sizes: list[int] = [] + + self._get_info_wrapper() + + def _get_info_wrapper(self) -> None: + event_loop = asyncio.new_event_loop() + asyncio.set_event_loop(event_loop) + async_tasks = [self._get_info()] + + event_loop.run_until_complete(asyncio.wait(async_tasks)) + event_loop.close() + + async def _get_info(self) -> None: + """This function is responsible for getting all files under the root_link""" + url_parse = urlparse(self._root_link) + + scheme: str + host: str + folder: str + if url_parse.scheme != "": + scheme = url_parse.scheme + else: + scheme = "" + if url_parse.hostname is not None: + host = url_parse.hostname + else: + _log_and_raise_error( + f"Unable to identify hostname from url: {self._root_link}", + error=ValueError, + level=LogLevel.ERROR, + ) + if url_parse.path != "": + folder = url_parse.path + else: + _log_and_raise_error( + f"Unable to identify folder or path from url: {self._root_link}", + error=ValueError, + level=LogLevel.ERROR, + ) + + client = await aioftp_client(host=host) + for path, info in await client.list(folder, recursive=True): + if str(path).endswith(tuple(self._extensions)): + download_url: str = f"{scheme}://{host}{path}" + self._files.append(download_url) + self._file_sizes.append(int(info["size"])) + + @property + def files(self) -> typing.Iterator[str]: + for file in self._files: + yield file + return self._files + + @property + def file_names(self) -> typing.Iterator[str]: + for file in self._files: + yield file.split("/")[-1] + # yield from [file.split("/")[-1] for file in self._files] + + @property + def file_sizes(self) -> 
typing.Iterable[int]: + for file in self._file_sizes: + yield file + # yield self._file_sizes + + +class Download: + def __init__( + self, + file_information: list[FileInformation], + core_count: int = 1, + ) -> None: + """This function is responsible for downloading items from the FTP urls passed into it""" + self._file_information: list[FileInformation] = file_information + self._core_count: int = min(core_count, 2) # Limit to 2 downloads at a time + self._download_counter: Synchronized = Synchronized(multiprocessing.Value("i", 1)) + self._semaphore = asyncio.Semaphore(self._core_count) + + # Find files to download + self._download_data_wrapper() + + def _download_data_wrapper(self) -> None: + """This function is responsible for "kicking off" asynchronous data downloading""" + print("Starting file download") + + event_loop = asyncio.new_event_loop() + asyncio.set_event_loop(event_loop) + async_tasks = [] + for file_information in self._file_information: + async_tasks.append( + self._aioftp_download_data( + file_information=file_information, + semaphore=self._semaphore, + ) + ) + + # Await all the tasks + event_loop.run_until_complete(asyncio.wait(async_tasks)) + event_loop.close() + + async def _aioftp_download_data(self, file_information: FileInformation, semaphore: asyncio.Semaphore) -> None: + url_parse = urlparse(file_information.download_url) + + scheme: str + host: str + folder: str + if url_parse.scheme != "": + scheme = url_parse.scheme + else: + scheme = "" + if url_parse.hostname is not None: + host = url_parse.hostname + else: + _log_and_raise_error( + f"Unable to identify hostname from url: {file_information.download_url}", + error=ValueError, + level=LogLevel.ERROR, + ) + if url_parse.path != "": + folder = url_parse.path + else: + _log_and_raise_error( + f"Unable to identify folder or path from url: {file_information.download_url}", + error=ValueError, + level=LogLevel.ERROR, + ) + + # Convert file size from byte to MB + size_mb: int = round(file_information.file_size / (1024**2)) + + # Use a semaphore so only N number of tasks can be started at once + async with semaphore: + client = await aioftp_client(host) + self._download_counter.acquire() + clear_print( + f"Started download {self._download_counter.value:02d} / {len(self._file_information):02d} ({size_mb} MB) - {file_information.raw_file_name}" + ) + self._download_counter.value += 1 + self._download_counter.release() + + # Download file, use "write_into" to write to a file, not a directory + await client.download(source=folder, destination=file_information.raw_file_path, write_into=True) + + await client.quit() + + +if __name__ == "__main__": + print("Use the proteomics_preprocess.py file") diff --git a/main/como/proteomics/FileInformation.py b/main/como/proteomics/FileInformation.py new file mode 100644 index 00000000..59a4a2ee --- /dev/null +++ b/main/como/proteomics/FileInformation.py @@ -0,0 +1,134 @@ +# ruff: noqa + +"""This will hold all relevant information about a single file to download + +This should be implemented as a list of objects +""" + +import os +import sys +from pathlib import Path + +import pandas as pd + +from como import project + + +def clear_print(message: str, end: str = "\033[K\r", flush: bool = True) -> None: + """Pass in your message exactly as you would like it printed, and this function will clear the screen and print it.""" + print(message, end=end, flush=flush) + + +class FileInformation: + # Create an "all_instances" variable of type list[FileInformation] + # This allows us to search 
through ALL instances of every FileInformation with functions declared here + # From: https://stackoverflow.com/a/17253634 + instances: list = [] + + def __init__( + self, + cell_type: str, + download_url: str | None = None, + study: int | str | None = None, + raw_path: Path | None = None, + intensity_csv: Path | None = None, + mzml_path: Path | None = None, + sqt_path: Path | None = None, + file_size: int | None = None, + ) -> None: + # File information + self.cell_type: str = cell_type + self.download_url: str = download_url + self.file_size: int = file_size + + # Must check for "None", as we are unable to do study[0] on a None object + if isinstance(study, str): + self.study: str = study + elif isinstance(study, int): + self.study: str = f"S{study}" + else: + self.study: str = "" + self.replicate: str = "" + self.batch: str = f"{study}" + + # Base file save paths + if raw_path is None: + self.raw_base_path: Path = Path(project.configs.data_dir, "results", cell_type, "proteomics", "raw") + else: + self.raw_base_path: Path = raw_path.parent + + if mzml_path is None: + self.mzml_base_path: Path = Path(project.configs.data_dir, "results", cell_type, "proteomics", "mzml") + else: + self.mzml_base_path: Path = mzml_path.parent + + if sqt_path is None: + self.sqt_base_path: Path = Path(project.configs.data_dir, "results", cell_type, "proteomics", "sqt") + else: + self.sqt_base_path: Path = sqt_path.parent + + if intensity_csv is None: + self.intensity_csv: Path = Path(project.configs.data_dir, "data_matrices", cell_type, f"protein_abundance_matrix_{cell_type}.csv") + else: + self.intensity_csv: Path = intensity_csv + + # The following variables have inital values set based only on an S# batch, not a replicate + # The set_replicate function must be called to set the values for a specific replicate, in which these variables will be reset + # File names + self.base_name: str = f"{self.cell_type}_{self.batch}_{Path(self.download_url).stem}" + self.raw_file_name: str = f"{self.base_name}.raw" + self.mzml_file_name: str = f"{self.base_name}.mzml" + self.sqt_file_name: str = f"{self.base_name}.target.sqt" + + # Full file paths + self.raw_file_path: Path = Path(self.raw_base_path, self.raw_file_name) + self.mzml_file_path: Path = Path(self.mzml_base_path, self.mzml_file_name) + self.sqt_file_path: Path = Path(self.sqt_base_path, self.sqt_file_name) + + # Intensity dataframe + self.base_columns: list[str] = ["uniprot"] + self.df_columns: list[str] = self.base_columns + [self.batch] + self.intensity_df: pd.DataFrame = pd.DataFrame(columns=self.df_columns) + + FileInformation.instances.append(self) + + def set_replicate(self, replicate: str | int): + """This function sets self.replicate, and also resets values that use the "replicate" value before it is used""" + # Set the initial replicate value + if isinstance(replicate, str): + self.replicate: str = replicate + else: + self.replicate: str = f"R{replicate}" + + # "Reset" additional values + self.batch: str = f"{self.study}{self.replicate}" + # File names + self.base_name: str = f"{self.cell_type}_{self.batch}_{Path(self.download_url).stem}" + self.raw_file_name: str = f"{self.base_name}.raw" + self.mzml_file_name: str = f"{self.base_name}.mzml" + self.sqt_file_name: str = f"{self.base_name}.target.sqt" + + # Full file paths + self.raw_file_path: Path = Path(self.raw_base_path, self.raw_file_name) + self.mzml_file_path: Path = Path(self.mzml_base_path, self.mzml_file_name) + self.sqt_file_path: Path = Path(self.sqt_base_path, self.sqt_file_name) + + # 
Intensity dataframe + self.base_columns: list[str] = ["uniprot"] + self.df_columns: list[str] = self.base_columns + [self.batch] + self.intensity_df: pd.DataFrame = pd.DataFrame(columns=self.df_columns) + + @classmethod + def filter_instances(cls, cell_type: str): + """This function finds all FileInformation objects that have the given cell type""" + sorted_instances: list = sorted(cls.instances, key=lambda x: x.study) + return [instance for instance in sorted_instances if instance.cell_type == cell_type] + + @staticmethod + def intensity_file_path(cell_type: str) -> Path: + """This function creates a single instance of the FileInformation class and returns the intensity_csv file location + This is useful because each cell type has a specific location all data gets written to + If all unique cell types are known, it is then possible to get their intensity csv file location + """ + information: FileInformation = FileInformation(cell_type=cell_type) + return information.intensity_csv diff --git a/main/como/proteomics/__init__.py b/main/como/proteomics/__init__.py new file mode 100644 index 00000000..d78d6d5c --- /dev/null +++ b/main/como/proteomics/__init__.py @@ -0,0 +1 @@ +from . import Crux, FileInformation, FTPManager, proteomics_preprocess diff --git a/main/como/proteomics/proteomics_preprocess.py b/main/como/proteomics/proteomics_preprocess.py new file mode 100644 index 00000000..b2223da7 --- /dev/null +++ b/main/como/proteomics/proteomics_preprocess.py @@ -0,0 +1,404 @@ +from __future__ import annotations + +import argparse +import csv +import os +from pathlib import Path + +from loguru import logger + +from como.data_types import LogLevel +from como.proteomics import Crux, FileInformation, FTPManager +from como.utils import _log_and_raise_error + + +class ArgParseFormatter(argparse.RawTextHelpFormatter, argparse.ArgumentDefaultsHelpFormatter): + """Use the RawTextHelpFormatter and the ArgumentDefaultsHelpFormatter in a single argparse parser().""" + + pass + + +class ParseCSVInput: + def __init__(self, input_csv_file: Path): + """Parse input CSV into two fields. + + 1. proteomeXchange URLs + 2. Cell Type + 3. Replicate (optional) + + This class is meant to make it easier to access each of these things + + + { + "naiveB": { + "url": [one, two, three], + "replicate": [A, B, C], + }, + "nucleophile": { + "url": [four, five, six], + "replicate": [D, E, F], + } + } + """ + self._input_csv_file: Path = input_csv_file + self._data: [str, dict[str, list[str]]] = {} + + # Get data from CSV + with self._input_csv_file.open("w") as i_stream: + reader = csv.reader(i_stream) + next(reader) + for line in reader: + if line == "" or line[0][0] == "#": # Skip 'comments' and empty lines + continue + else: + url = line[0] + cell_type = line[1] + try: + study = line[2] + except IndexError: + study = "" + + if cell_type not in self._data: + self._data[cell_type] = {"url": [], "study": []} + + self._data[cell_type]["url"].append(url) + self._data[cell_type]["study"].append(study) + + # Convert from 'old' /pride/data/archive to 'new' /pride-archive + for key in self._data: + urls = self._data[key]["url"] + for i, url in enumerate(urls): + urls[i] = url.replace("/pride/data/archive", "/pride-archive") + + @property + def ftp_urls(self) -> list[str]: + """Return a list of FTP URLs contained in the input CSV. 
+ + Example: ftp://ftp.my_server.com + """ + master_urls: list[str] = [] + for cell_type in self._data: + urls = self._data[cell_type]["url"] + master_urls.extend(urls) + + return urls + + @property + def input_cell_types(self) -> list[str]: + """Return the cell types as defined in the input CSV file. + + TODO: Match folder paths to correlate S1R1, S1R2, etc.? + """ + cell_types: list[str] = [] + for key in self._data: + # Get the number of URLs per cell type to find the amount of cell types input + num_urls: int = len(self._data[key]["url"]) + cell_types.extend([key] * num_urls) + return cell_types + + @property + def studies(self) -> list[str]: + """Return the replicates as defined in the input CSV file.""" + master_studies: list[str] = [] + for cell_type in self._data: + replicates = self._data[cell_type]["study"] + master_studies.extend(replicates) + return master_studies + + @property + def csv_dict(self) -> dict[str, dict[str, list[str]]]: + """Return the CSV information as a dictionary. + + It contains data in the following format + { + CELL_TYPE_1: { + "url": ["url_one", "url_two", 'url_three', "url_four"], + "replicate": ["S1R1", "S1R2", "S2R1", "S3R1"] + }, + CELL_TYPE_2: { + "url": ["url_five", "url_six", 'url_seven', "url_eight"], + "replicate": ["S1R1", "S1R2", "S2R1", "S2R2"] + } + } + """ + return self._data + + +class PopulateInformation: + def __init__( + self, + file_information: list[FileInformation], + csv_data: ParseCSVInput, + skip_download: bool, + preferred_extensions: list[str] | None = None, + ): + """Populate FileInformation list with data from the input CSV file.""" + self.file_information: list[FileInformation] = file_information + self._csv_data: ParseCSVInput = csv_data + self._csv_data: dict[str, dict[str, list[str]]] = csv_data.csv_dict + self._skip_download: bool = skip_download + + # Set default value for extensions to search for + self._preferred_extensions: list[str] = preferred_extensions + if self._preferred_extensions is None: + self._preferred_extensions = ["raw"] + + self._gather_data() + self._set_replicate_numbers() + + if self._skip_download is False: + self.print_download_size() + + def _gather_data(self): + # Iterate through the cell type and corresponding list of URLS + # cell_type: naiveB + # ftp_urls: ["url_1", "url_2"] + for cell_type in self._csv_data: + ftp_urls: list[str] = self._csv_data[cell_type]["url"] + studies: list[str] = self._csv_data[cell_type]["study"] + url_count = 0 + + # Iterate through the URLs available + for url, study in zip(ftp_urls, studies, strict=True): + ftp_data: FTPManager.Reader = FTPManager.Reader(root_link=url, file_extensions=self._preferred_extensions) + + urls = list(ftp_data.files) + sizes = list(ftp_data.file_sizes) + url_count += len(urls) + + # Iterate through all files and sizes found for url_## + for file, size in zip(urls, sizes, strict=True): + self.file_information.append(FileInformation(cell_type=cell_type, download_url=file, file_size=size, study=study)) + + def print_download_size(self): + """Print the total size to download if we must download data.""" + total_size: int = 0 + for information in self.file_information: + total_size += information.file_size + + # Convert to MB + total_size = total_size // 1024**2 + logger.info(f"Total size to download: {total_size} MB") + + def _set_replicate_numbers(self): + instances: dict[str, list[FileInformation]] = {} + for information in self.file_information: + if information.cell_type not in instances: + instances[information.cell_type] = 
FileInformation.filter_instances(information.cell_type) + + for cell_type in instances: + replicate_num: int = 1 + for i, file_information in enumerate(instances[cell_type]): + current_info: FileInformation = file_information + previous_info: FileInformation = instances[cell_type][i - 1] if i > 0 else None + + # Do not modify the replicate value if we are on the very first iteration of this cell type + if i == 0: + pass + # If the current_info cell type and study match the previous, increment the replicate_num by one + elif current_info.cell_type == previous_info.cell_type and current_info.study == previous_info.study: + replicate_num += 1 + else: + replicate_num: int = 1 + + replicate_value: str = f"R{replicate_num}" + current_info.set_replicate(replicate_value) + + def _collect_cell_type_information(self, cell_type: str) -> list[FileInformation]: + """Collect all FileInformation objects of a given cell type. + + Arg: + cell_type: The cell type to collect information for. + + Returns: + A list of FileInformation objects matching the given cell type. + """ + return [information for information in self.file_information if information.cell_type == cell_type] + + +def parse_args() -> argparse.Namespace: + """Parse arguments from the command line. + + Returns: + An argparse.Namespace object containing the parsed arguments. + """ + parser = argparse.ArgumentParser( + prog="proteomics_preprocess.py", + description="Download and analyze proteomics data from proteomeXchange\n" + "Comments can be added to the csv file by starting a line with a '#'\n" + "The input file should be formatted as the following example:\n" + "\n" + "url,cell_type,study\n" + "# This is a comment\n" + "ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2022/02/PXD026140,naiveB,S1\n" + "ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2022/02/PXD017564,m0Macro,S1\n" + "ftp://ftp.pride.ebi.ac.uk/pride/data/archive/2022/02/PXD017987,naiveB,S2\n", + epilog="For additional help, please post questions/issues in the MADRID GitHub repo at " + "https://github.com/HelikarLab/MADRID or email babessell@gmail.com", + formatter_class=ArgParseFormatter, + ) + parser.add_argument( + "-i", + "--input", + required=True, + dest="input_csv", + metavar="/home/USER/data/proteomics_urls.csv", + help="The proteomeXchange CSV file location", + ) + parser.add_argument( + "-d", + "--database", + required=True, + dest="database", + metavar="/home/USER/data/database_file.fasta", + help="The fasta database to search for protein identification", + ) + parser.add_argument( + "-e", + "--extensions", + dest="extensions", + required=False, + default="raw", + help="A list of file extensions to download from the FTP server", + metavar="raw,txt,another_extension", + ) + parser.add_argument( + "--skip-download", + required=False, + dest="skip_download", + default=False, + type=bool, + help="If this action is passed in, FTP data will not be downloaded. " + "This assumes you have raw data under the folder specified by the option '--ftp-out-dir'", + ) + parser.add_argument( + "--skip-mzml", + required=False, + dest="skip_mzml", + type=bool, + default=False, + help="If this action is passed in, files will not be converted from RAW to mzML format. " + "This assumes you have mzML files under the folder specified by the option '--mzml-out-dir'. " + "This will continue the workflow from SQT file creation -> CSV ion intensity creation. 
" + "If this option is passed in, FTP data will also not be downloaded.", + ) + parser.add_argument( + "--skip-sqt", + required=False, + dest="skip_sqt", + type=bool, + default=False, + help="If this action is passed in, SQT files will not be created. " + "This assumes you have SQT files under the folder specified by the option '--sqt-out-dir'. " + "This will only read data from SQT files and create a CSV ion intensity file. " + "If this option is passed in, FTP data will not be downloaded, " + "RAW files will not be converted, and SQT files will not be created.", + ) + parser.add_argument( + "-c", + "--cores", + required=False, + dest="core_count", + metavar="cores", + default=os.cpu_count() // 2, + help="This is the number of threads to use for downloading files. " + "It will default to the minimum of: half the available CPU cores available, " + "or the number of input files found. " + "It will not use more cores than necessary. " + "Options are an integer or 'all' to use all available cores. " + "Note: Downloading will use a MAX of 2 threads at once, " + "as some FTP servers do not work well with multiple connections from the same IP address at once.", + ) + # TODO: Add option to delete intermediate files (raw, mzml, sqt) + + args: argparse.Namespace = parser.parse_args() + args.extensions = args.extensions.split(",") + + # Validte the input file exists + if not Path(args.input_csv).is_file(): + _log_and_raise_error(f"Input file {args.input} does not exist!", error=FileNotFoundError, level=LogLevel.ERROR) + + if args.core_count == "all": + args.core_count = os.cpu_count() + elif not str(args.core_count).isdigit(): + _log_and_raise_error( + f"Invalid option '{args.core_count}' for option '--cores'. Enter an integer or 'all' to use all cores", + error=ValueError, + level=LogLevel.ERROR, + ) + + else: + args.core_count = int(args.core_count) + if args.core_count > os.cpu_count(): + logger.info(f"{args.core_count} cores not available, system only has {os.cpu_count()} cores. Setting '--cores' to {os.cpu_count()}") + args.core_count = os.cpu_count() + + return args + + +def _main(): + file_information: list[FileInformation] = [] + args: argparse.Namespace = parse_args() + csv_data = ParseCSVInput(args.input_csv) + + """ + This comment is for the logic surrounding "skipping" a step in the workflow + 1. skip_download (download FTP data - FTPManager) + 2. skip_mzml_conversion (Convert raw to mzML - Crux) + 3. 
skip_sqt_creation (Convert mzML to SQT - Crux) + + If args.skip_sqt is True, do not perform steps 1, 2, or 3 + If args.skip_mzml is True, do not perform step 1 or 2 + If args.skip_download is True, do not perform step 1 + + Ultimately, this results in if-statements that look like: + if __ is False: + do_tasks + Because we are performing tasks if the "skip" is False + """ + if args.skip_sqt: + args.skip_mzml = True + args.skip_download = True + elif args.skip_mzml: + args.skip_download = True + + # Populate the file_information list + PopulateInformation( + file_information=file_information, + csv_data=csv_data, + skip_download=args.skip_download, + preferred_extensions=args.extensions, + ) + + # Download data if we should not skip anything + if args.skip_download is False: + # Start the download of FTP data + FTPManager.Download( + file_information=file_information, + core_count=args.core_count, + ) + + if args.skip_mzml is False: + # Convert raw to mzML and then create SQT files + Crux.RAWtoMZML( + file_information=file_information, + core_count=args.core_count, + ) + + if args.skip_sqt is False: + # Convert mzML to SQT + Crux.MZMLtoSQT( + file_information=file_information, + fasta_database=args.database, + core_count=args.core_count, + ) + + # Create CSV file from SQT files + Crux.SQTtoCSV( + file_information=file_information, + core_count=args.core_count, + ) + + +if __name__ == "__main__": + _main() diff --git a/main/como/proteomics_gen.py b/main/como/proteomics_gen.py new file mode 100644 index 00000000..33bcc995 --- /dev/null +++ b/main/como/proteomics_gen.py @@ -0,0 +1,269 @@ +from __future__ import annotations + +import sys +from io import TextIOWrapper +from pathlib import Path + +import numpy as np +import pandas as pd +from fast_bioservices.biodbnet import BioDBNet, Input, Output +from loguru import logger + +from como.data_types import LOG_FORMAT, LogLevel +from como.project import Config +from como.proteomics_preprocessing import protein_transform_main +from como.utils import _log_and_raise_error, return_placeholder_data, set_up_logging + + +# Load Proteomics +def process_proteomics_data(path: Path) -> pd.DataFrame: + """Load proteomics data from a given context and filename. + + Args: + path: Path to the proteomics data file (CSV format). + + Returns: + pd.DataFrame: Processed proteomics data with 'gene_symbol' column exploded. + """ + # Preprocess data, drop na, duplicate ';' in symbol, + matrix: pd.DataFrame = pd.read_csv(path) + if "gene_symbol" not in matrix.columns: + _log_and_raise_error( + "No gene_symbol column found in proteomics data.", + error=ValueError, + level=LogLevel.ERROR, + ) + + matrix["gene_symbol"] = matrix["gene_symbol"].astype(str) + matrix.dropna(subset=["gene_symbol"], inplace=True) + matrix = matrix.assign(gene_symbol=matrix["gene_symbol"].str.split(";")).explode("gene_symbol") + return matrix + + +# read map to convert to entrez +async def load_gene_symbol_map(gene_symbols: list[str], entrez_map: Path | None = None): + """Load a mapping from gene symbols to Entrez IDs. + + Args: + gene_symbols (list[str]): List of gene symbols to map. + entrez_map (Path | None): Optional path to a CSV file containing precomputed mappings. + + Returns: + pd.DataFrame: DataFrame with gene symbols as index and corresponding Entrez IDs. 
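+
+    Example:
+        A minimal sketch; the symbols and the cache path are illustrative, and the
+        "gene_id" column name matches the BioDBNet output used below:
+
+            df = await load_gene_symbol_map(
+                gene_symbols=["TP53", "EGFR"],
+                entrez_map=Path("gene_symbol_to_entrez.csv"),
+            )
+            entrez_ids = df["gene_id"]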
+ """ + if entrez_map and entrez_map.exists(): + df = pd.read_csv(entrez_map, index_col="gene_symbol") + else: + biodbnet = BioDBNet() + df = await biodbnet.async_db2db( + values=gene_symbols, + input_db=Input.GENE_SYMBOL, + output_db=[Output.GENE_ID, Output.ENSEMBL_GENE_ID], + ) + df.loc[df["gene_id"].isna(), ["gene_id"]] = np.nan + df.to_csv(entrez_map, index_label="gene_symbol") + + return df[~df.index.duplicated()] + + +def abundance_to_bool_group( + context_name, + abundance_filepath: Path, + output_gaussian_png_filepath: Path, + output_gaussian_html_filepath: Path, + output_z_score_matrix_filepath: Path, + abundance_matrix: pd.DataFrame, + replicate_ratio: float, + high_confidence_replicate_ratio: float, + quantile: float, + output_boolean_filepath: Path, +): + """Convert proteomic data to boolean expression.""" + abundance_matrix.to_csv(abundance_filepath, index_label="entrez_gene_id") + protein_transform_main( + abundance_df=abundance_matrix, + output_gaussian_png_filepath=output_gaussian_png_filepath, + output_gaussian_html_filepath=output_gaussian_html_filepath, + output_z_score_matrix_filepath=output_z_score_matrix_filepath, + ) + + # Logical Calculation + abundance_matrix_nozero = abundance_matrix.replace(0, np.nan) + thresholds = abundance_matrix_nozero.quantile(quantile, axis=0) + testbool = pd.DataFrame(0, columns=abundance_matrix.columns, index=abundance_matrix.index) + + for col in abundance_matrix.columns: + testbool.loc[abundance_matrix[col] > thresholds[col], [col]] = 1 + + abundance_matrix["expressed"] = 0 + abundance_matrix["high"] = 0 + abundance_matrix["pos"] = abundance_matrix[abundance_matrix > 0].sum(axis=1) / abundance_matrix.count(axis=1) + abundance_matrix.loc[(abundance_matrix["pos"] >= replicate_ratio), ["expressed"]] = 1 + abundance_matrix.loc[(abundance_matrix["pos"] >= high_confidence_replicate_ratio), ["high"]] = 1 + + abundance_matrix.to_csv(output_boolean_filepath, index_label="entrez_gene_id") + + +def to_bool_context(context_name, group_ratio, hi_group_ratio, group_names): + """Convert proteomic data to boolean expression.""" + config = Config() + output_dir = config.result_dir / context_name / "proteomics" + merged_df = pd.DataFrame(columns=["entrez_gene_id", "expressed", "high"]) + merged_df.set_index(["entrez_gene_id"], inplace=True) + merged_hi_df = merged_df + + for group in group_names: + read_filepath = output_dir / f"bool_prot_Matrix_{context_name}_{group}.csv" + read_df = pd.read_csv(read_filepath) + read_df.set_index("entrez_gene_id", inplace=True) + read_df = read_df[["expressed", "high"]] + + if not merged_df.empty: + merged_df = pd.merge(merged_df, read_df["expressed"], right_index=True, left_index=True) + merged_hi_df = pd.merge(merged_hi_df, read_df["high"], right_index=True, left_index=True) + + else: + merged_df = read_df["expressed"].to_frame() + merged_hi_df = read_df["high"].to_frame() + + if len(merged_df.columns) > 1: + merged_df.apply(lambda x: sum(x) / len(merged_df.columns) >= group_ratio, axis=1, result_type="reduce") + merged_hi_df.apply(lambda x: sum(x) / len(merged_hi_df.columns) >= hi_group_ratio, axis=1, result_type="reduce") + + out_df = pd.merge(merged_df, merged_hi_df, right_index=True, left_index=True) + out_filepath = output_dir / f"Proteomics_{context_name}.csv" + out_df.to_csv(out_filepath, index_label="entrez_gene_id") + logger.success(f"Test Data Saved to {out_filepath}") + + +# read data from csv files +def load_proteomics_tests(filename, context_name) -> tuple[str, pd.DataFrame]: + """Load statistical test 
results. + + Arg: + filename (str): The name of the file to load. + context_name (str): The context name for the data. + + Returns: + tuple: A tuple containing the context name and the loaded data as a pandas DataFrame. + """ + config = Config() + + def load_empty_dict(): + return "dummy", return_placeholder_data() + + if not filename or filename == "None": # if not using proteomics load empty dummy data matrix + return load_empty_dict() + + inquiry_full_path = config.data_dir / "config_sheets" / filename + if not inquiry_full_path.exists(): + _log_and_raise_error(f"Error: file not found {inquiry_full_path}", error=FileNotFoundError, level=LogLevel.ERROR) + + filename = f"Proteomics_{context_name}.csv" + full_save_filepath = config.result_dir / context_name / "proteomics" / filename + if full_save_filepath.exists(): + data = pd.read_csv(full_save_filepath, index_col="entrez_gene_id") + logger.success(f"Read from {full_save_filepath}") + return context_name, data + + else: + logger.warning(f"Proteomics gene expression file for {context_name} was not found at {full_save_filepath}. Is this intentional?") + return load_empty_dict() + + +async def proteomics_gen( + context_name: str, + config_filepath: Path, + matrix_filepath: Path, + output_boolean_filepath: Path, + output_z_score_matrix_filepath: Path, + output_gaussian_png_filepath: Path | None = None, + output_gaussian_html_filepath: Path | None = None, + input_entrez_map: Path | None = None, + replicate_ratio: float = 0.5, + batch_ratio: float = 0.5, + high_confidence_replicate_ratio: float = 0.7, + high_confidence_batch_ratio: float = 0.7, + quantile: int = 25, + log_level: LogLevel = LogLevel.INFO, + log_location: str | TextIOWrapper = sys.stderr, +): + """Generate proteomics data.""" + set_up_logging(level=log_level, location=log_location) + + if not config_filepath.exists(): + _log_and_raise_error( + f"Config file not found at {config_filepath}", + error=FileNotFoundError, + level=LogLevel.ERROR, + ) + if config_filepath.suffix not in (".xlsx", ".xls"): + _log_and_raise_error( + f"Config file must be an xlsx or xls file at {config_filepath}", + error=FileNotFoundError, + level=LogLevel.ERROR, + ) + + if not matrix_filepath.exists(): + _log_and_raise_error( + f"Matrix file not found at {matrix_filepath}", + error=FileNotFoundError, + level=LogLevel.ERROR, + ) + if matrix_filepath.suffix != ".csv": + _log_and_raise_error( + f"Matrix file must be a csv file at {matrix_filepath}", + error=FileNotFoundError, + level=LogLevel.ERROR, + ) + + if quantile < 0 or quantile > 100: + _log_and_raise_error( + "Quantile must be an integer from 0 to 100", + error=ValueError, + level=LogLevel.ERROR, + ) + quantile /= 100 + + config_df = pd.read_excel(config_filepath, sheet_name=context_name) + matrix: pd.DataFrame = process_proteomics_data(matrix_filepath) + + groups = config_df["group"].unique().tolist() + + for group in groups: + indices = np.where([g == group for g in config_df["group"]]) + sample_columns = [*np.take(config_df["sample_name"].to_numpy(), indices).ravel().tolist(), "gene_symbol"] + matrix = matrix.loc[:, sample_columns] + + symbols_to_gene_ids = await load_gene_symbol_map( + gene_symbols=matrix["gene_symbol"].tolist(), + entrez_map=input_entrez_map, + ) + matrix.dropna(subset=["gene_symbol"], inplace=True) + if "uniprot" in matrix.columns: + matrix.drop(columns=["uniprot"], inplace=True) + + matrix = matrix.groupby(["gene_symbol"]).agg("max") + matrix["entrez_gene_id"] = symbols_to_gene_ids["gene_id"] + 
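+        # Symbols without an Entrez mapping are dropped before re-indexing by Entrez gene ID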
matrix.dropna(subset=["entrez_gene_id"], inplace=True) + matrix.set_index("entrez_gene_id", inplace=True) + + # bool_filepath = output_dir / f"bool_prot_Matrix_{context_name}_{group_name}.csv" + abundance_to_bool_group( + context_name=context_name, + abundance_filepath=matrix_filepath, + abundance_matrix=matrix, + replicate_ratio=replicate_ratio, + high_confidence_replicate_ratio=high_confidence_replicate_ratio, + quantile=quantile, + output_boolean_filepath=output_boolean_filepath, + output_gaussian_png_filepath=output_gaussian_png_filepath, + output_gaussian_html_filepath=output_gaussian_html_filepath, + output_z_score_matrix_filepath=output_z_score_matrix_filepath, + ) + to_bool_context( + context_name=context_name, + group_ratio=batch_ratio, + hi_group_ratio=high_confidence_batch_ratio, + group_names=groups, + ) diff --git a/main/como/proteomics_preprocessing.py b/main/como/proteomics_preprocessing.py new file mode 100644 index 00000000..39d4c85c --- /dev/null +++ b/main/como/proteomics_preprocessing.py @@ -0,0 +1,174 @@ +from __future__ import annotations + +import colorsys +import random +from dataclasses import dataclass +from pathlib import Path + +import numpy as np +import numpy.typing as npt +import pandas as pd +import plotly.graph_objs as go +from loguru import logger +from plotly.subplots import make_subplots +from scipy import stats +from scipy.signal import find_peaks +from sklearn.neighbors import KernelDensity + + +@dataclass +class ZResult: + """Dataclass to store results of Z-score calculation.""" + + zfpkm: pd.DataFrame + x_range: pd.DataFrame # npt.NDArray[np.float64] + density: pd.DataFrame # npt.NDArray[np.float64] + mu: npt.NDArray[np.float64] + std_dev: npt.NDArray[np.float64] + max_fpkm_peak: npt.NDArray[np.float64] + + +def z_score_calc(abundance: pd.DataFrame, min_thresh: int) -> ZResult: + """Calculate Z-scores for protein abundance data. + + Arg: + abundance: DataFrame with protein abundance data. + min_thresh: Minimum threshold for abundance values to be considered. + + Returns: + A ZResult object containing Z-score transformed data and related statistics. 
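+
+    Notes:
+        The loop below implements a zFPKM-style transform per sample column:
+        a Gaussian KDE (bandwidth 0.5) is fit to log2(abundance + 1) for entries
+        where abundance + 1 exceeds min_thresh, mu is the position of the
+        right-most density peak, u is the mean of the log2 values in (0, mu), and
+
+            std_dev = (mu - u) * sqrt(pi / 2)
+            zfpkm   = (log2(abundance + 1) - mu) / std_dev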
+ """ + values = abundance.values.copy() + 1 + log_abundance_filt = np.log2(values[values > min_thresh]).reshape((len(abundance), len(abundance.columns))) + log_abundance = np.log2(values) + + # np.zeros((1000, len(abundance.columns)), dtype=np.float64), + z_result = ZResult( + zfpkm=pd.DataFrame(data=np.nan * np.ones_like(values), index=abundance.index, columns=abundance.columns, dtype=np.float64), + x_range=pd.DataFrame(data=np.zeros((1000, len(abundance.columns))), columns=abundance.columns, dtype=np.float64), + density=pd.DataFrame(data=np.zeros((1000, len(abundance.columns))), columns=abundance.columns, dtype=np.float64), + mu=np.zeros(len(abundance.columns)), + std_dev=np.zeros(len(abundance.columns)), + max_fpkm_peak=np.zeros(len(abundance.columns)), + ) + + for i, col in enumerate(abundance.columns): + kde = KernelDensity(kernel="gaussian", bandwidth=0.5).fit(log_abundance_filt[:, i].reshape(-1, 1)) + x_range = np.linspace(log_abundance[:, i].min(), log_abundance[:, i].max(), 1000) + density = np.exp(kde.score_samples(x_range.reshape(-1, 1))) # type: ignore + peaks, _ = find_peaks(density, height=0.02, distance=1.0) + peak_positions = x_range[peaks] + + mu = peak_positions.max() + max_fpkm_peak = density[peaks[np.argmax(peak_positions)]] # type: ignore + + # Select rows from `log_abundance` that are greater than 0 and less than mu in the column `i` + u = log_abundance[:, i][(log_abundance[:, i] > 0) & (log_abundance[:, i] < mu)].mean() + std_dev = (mu - u) * np.sqrt(np.pi / 2) + zfpkm_values = (log_abundance[:, i] - mu) / std_dev + + z_result.zfpkm[col] = zfpkm_values + z_result.x_range[col] = x_range + z_result.density[col] = density + z_result.mu[i] = mu + z_result.std_dev[i] = std_dev + z_result.max_fpkm_peak[i] = max_fpkm_peak + + return z_result + + +def lighten_color(red: int, green: int, blue: int, factor: float = 0.5) -> str: + """Lighten a color by a given factor. + + Args: + red: Red component (0-255). + green: Green component (0-255). + blue: Blue component (0-255). + factor: Factor by which to lighten the color (0.0 to 1.0). + + Returns: + A string representing the lightened color in RGB format. + """ + # Convert RGB to HLS + hue, lightness, saturation = colorsys.rgb_to_hls(red / 255.0, green / 255.0, blue / 255.0) + + # Increase lightness + lightness = min(1.0, lightness + (1 - lightness) * factor) + + # Covnert back to RGB values + r, g, b = colorsys.hsv_to_rgb(hue, saturation, lightness) + return f"rgb({int(r * 255)},{int(g * 255)},{int(b * 255)})" + + +# Plotting function +def plot_gaussian_fit(z_results: ZResult, facet_titles: bool = True, x_min: int = -4) -> go.Figure: + """Plot Gaussian fit for Z-score transformed data. + + Arg: + z_results: The results from the Z-score calculation. + facet_titles: Whether to show titles for each facet. + x_min: Minimum x-axis value for the plots. + + Returns: + A Plotly Figure object containing the Gaussian fit plots. 
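+
+    Example:
+        A minimal sketch (the output file name is illustrative):
+
+            z_results = z_score_calc(abundance_df, min_thresh=0)
+            fig = plot_gaussian_fit(z_results)
+            fig.write_html("gaussian_fit.html")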
+ + """ + zfpkm = z_results.zfpkm + x_range = z_results.x_range + density = z_results.density + mu = z_results.mu + std_dev = z_results.std_dev + max_fpkm = z_results.max_fpkm_peak + + fig = make_subplots(rows=len(zfpkm.columns), cols=1, subplot_titles=zfpkm.columns if facet_titles else [None] * len(zfpkm.columns)) + for i, col in enumerate(zfpkm.columns): + fitted = stats.norm.pdf(x_range[col], loc=mu[i], scale=std_dev[i]) + scale_fit = fitted * (max_fpkm[i] / fitted.max()) + + # Select random RGB values for each plot + r = random.randint(0, 255) # noqa: S311, not for cryptographic purposes + g = random.randint(0, 255) # noqa: S311, not for cryptographic purposes + b = random.randint(0, 255) # noqa: S311, not for cryptographic purposes + color, lighten = f"rgb({r},{g},{b})", lighten_color(r, g, b) + fig.add_trace( + go.Scatter(x=x_range[col], y=density[col], name="Abundance Density", line={"color": color}), + row=i + 1, + col=1, + ) + fig.add_trace( + go.Scatter(x=x_range[col], y=scale_fit, name="Fitted Gaussian", line={"dash": "dash", "color": lighten}), + row=i + 1, + col=1, + ) + + fig.update_xaxes(title_text="log2(abundance)", range=[x_min, x_range[col].max()], row=i + 1, col=1) + fig.update_yaxes(title_text="[scaled] density", row=i + 1, col=1) + + fig.update_layout(height=len(zfpkm.columns) * 250, width=800, title_text="Gaussian Fit per Sample") + return fig + + +# Main function for protein abundance transformation +def protein_transform_main( + abundance_df: pd.DataFrame | str | Path, + output_gaussian_png_filepath: Path, + output_gaussian_html_filepath: Path, + output_z_score_matrix_filepath: Path, +) -> None: + """Transform protein abundance data.""" + abundance_df: pd.DataFrame = pd.read_csv(abundance_df) if isinstance(abundance_df, (str, Path)) else abundance_df.fillna(0) + abundance_df = abundance_df[np.isfinite(abundance_df).all(axis=1)] # Remove +/- infinity values + z_transform: ZResult = z_score_calc(abundance_df, min_thresh=0) + + fig = plot_gaussian_fit(z_results=z_transform, facet_titles=True, x_min=-4) + + if output_gaussian_png_filepath: + fig.write_image(output_gaussian_png_filepath.with_suffix(".png")) + logger.info(f"PNG gaussian figure written to {output_gaussian_png_filepath}") + if output_gaussian_html_filepath: + fig.write_html(output_gaussian_html_filepath.with_suffix(".html")) + logger.info(f"Interactive HTML gaussian figure written to {output_gaussian_png_filepath}") + + z_transformed_abundances = z_transform.zfpkm + z_transformed_abundances[abundance_df == 0] = -4 + z_transformed_abundances.to_csv(output_z_score_matrix_filepath, index=False) diff --git a/main/como/rnaseq_gen.py b/main/como/rnaseq_gen.py new file mode 100644 index 00000000..ed6274ee --- /dev/null +++ b/main/como/rnaseq_gen.py @@ -0,0 +1,1040 @@ +from __future__ import annotations + +import multiprocessing +import sys +import time +from collections import namedtuple +from concurrent.futures import Future, ProcessPoolExecutor, as_completed +from dataclasses import dataclass, field +from enum import Enum +from io import TextIOWrapper +from pathlib import Path +from typing import NamedTuple + +import matplotlib.pyplot as plt +import numpy as np +import numpy.typing as npt +import pandas as pd +import scanpy as sc +import seaborn as sns +import sklearn +import sklearn.neighbors +from fast_bioservices.pipeline import ensembl_to_gene_id_and_symbol, gene_symbol_to_ensembl_and_gene_id +from loguru import logger +from pandas import DataFrame +from scipy.signal import find_peaks +from sklearn.neighbors 
import KernelDensity + +from como.data_types import FilteringTechnique, LogLevel, PeakIdentificationParameters, RNAType +from como.migrations import gene_info_migrations +from como.project import Config +from como.utils import _log_and_raise_error, num_columns, read_file, set_up_logging + + +class _FilteringOptions(NamedTuple): + replicate_ratio: float + batch_ratio: float + cut_off: float + high_replicate_ratio: float + high_batch_ratio: float + + +class LayoutMethod(Enum): + """RNA sequencing layout method.""" + + paired_end = "paired-end" + single_end = "single-end" + + +@dataclass(slots=True) +class _StudyMetrics: + study: str + num_samples: int + count_matrix: pd.DataFrame + fragment_lengths: npt.NDArray[np.float32] + sample_names: list[str] + layout: list[LayoutMethod] + entrez_gene_ids: list[int] + gene_sizes: npt.NDArray[np.float32] + __normalization_matrix: pd.DataFrame = field(default_factory=pd.DataFrame) + __z_score_matrix: pd.DataFrame = field(default_factory=pd.DataFrame) + __high_confidence_entrez_gene_ids: list[str] = field(default=list) # type: ignore + + def __post_init__(self): + for layout in self.layout: + if layout not in LayoutMethod: + _log_and_raise_error( + f"Layout must be 'paired-end' or 'single-end'; got: {layout}", + error=ValueError, + level=LogLevel.ERROR, + ) + + @property + def normalization_matrix(self) -> pd.DataFrame: + return self.__normalization_matrix + + @normalization_matrix.setter + def normalization_matrix(self, value: pd.DataFrame) -> None: + self.__normalization_matrix = value + + @property + def z_score_matrix(self) -> pd.DataFrame: + return self.__z_score_matrix + + @z_score_matrix.setter + def z_score_matrix(self, value: pd.DataFrame) -> None: + self.__z_score_matrix = value + + @property + def high_confidence_entrez_gene_ids(self) -> list[str]: + return self.__high_confidence_entrez_gene_ids + + @high_confidence_entrez_gene_ids.setter + def high_confidence_entrez_gene_ids(self, values: list[str]) -> None: + self.__high_confidence_entrez_gene_ids = values + + +class _ZFPKMResult(NamedTuple): + zfpkm: pd.Series + density: Density + mu: float + std_dev: float + max_fpkm: float + + +class _CountMetrics(NamedTuple): + metrics: dict[str, _StudyMetrics] + entrez_gene_ids: list[int] + + +Density = namedtuple("Density", ["x", "y"]) +NamedMetrics = dict[str, _StudyMetrics] + + +def k_over_a(data: pd.DataFrame | npt.NDArray, k: int, a: float) -> npt.NDArray: + """Return a boolean mask of rows where at least k entries are >= a. + + This code is based on the `kOverA` function found in R's `genefilter` package: https://www.rdocumentation.org/packages/genefilter/versions/1.54.2/topics/kOverA + + Args: + data: The data to filter, either a Pandas DataFrame or a NumPy array. + k: The minimum number of times the sum of elements must be greater than or equal to A. + a: The value to compare the sum of elements to. + + Returns: + A NumPy array of booleans indicating which rows pass the filter. + """ + arr = data.to_numpy() if isinstance(data, pd.DataFrame) else np.asarray(data) + counts = np.sum(arr >= a, axis=1) + return counts >= k + + +async def _build_count_metrics( + *, + matrix: pd.DataFrame, + gene_info: pd.DataFrame, + metadata_df: pd.DataFrame, + taxon: int, +) -> _CountMetrics: + """Read the counts matrix and returns the results. 
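As a point of reference for the `k_over_a` helper defined above, the following self-contained snippet shows what the boolean mask looks like on a tiny, made-up expression matrix; the gene and sample names are purely illustrative, and the helper is re-declared locally so the example runs on its own.

```python
import numpy as np
import pandas as pd

def k_over_a(data, k, a):
    # Same test as the helper above: keep rows where at least k entries are >= a.
    arr = data.to_numpy() if isinstance(data, pd.DataFrame) else np.asarray(data)
    return np.sum(arr >= a, axis=1) >= k

# Toy expression matrix: 3 genes x 4 samples (hypothetical values).
toy = pd.DataFrame(
    {"s1": [5, 0, 2], "s2": [7, 1, 0], "s3": [0, 0, 3], "s4": [9, 0, 1]},
    index=["gene_a", "gene_b", "gene_c"],
)
mask = k_over_a(toy, k=2, a=2)       # at least 2 samples with a value >= 2
print(toy.index[mask].tolist())      # ['gene_a', 'gene_c']
```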
+
+    Args:
+        matrix: The gene counts matrix to process
+        gene_info: The gene information dataframe used to annotate and filter the counts matrix
+        metadata_df: The configuration dataframe related to the current context
+        taxon: The NCBI Taxon ID
+
+    Returns:
+        A `_CountMetrics` named tuple of per-study metrics and the matching Entrez gene IDs
+
+    Raises:
+        ValueError: If no columns to merge on are found, or if no sample columns are found after merging with gene information.
+    """
+    matrix.dropna(subset="ensembl_gene_id", inplace=True)
+    conversion = await ensembl_to_gene_id_and_symbol(ids=matrix["ensembl_gene_id"].tolist(), taxon=taxon)
+    conversion["ensembl_gene_id"] = conversion["ensembl_gene_id"].str.split(",")
+    conversion = conversion.explode("ensembl_gene_id")
+    conversion.reset_index(inplace=True, drop=True)
+
+    merge_on = []
+    if "ensembl_gene_id" in matrix.columns and "ensembl_gene_id" in conversion.columns:
+        merge_on.append("ensembl_gene_id")
+    if "entrez_gene_id" in matrix.columns and "entrez_gene_id" in conversion.columns:
+        merge_on.append("entrez_gene_id")
+    if "gene_symbol" in matrix.columns and "gene_symbol" in conversion.columns:
+        merge_on.append("gene_symbol")
+
+    if not merge_on:
+        _log_and_raise_error(
+            "No columns to merge on. Tested 'ensembl_gene_id', 'entrez_gene_id', and 'gene_symbol'. Please check your input files.",
+            error=ValueError,
+            level=LogLevel.ERROR,
+        )
+    matrix = matrix.merge(conversion, on=merge_on, how="left")
+
+    # Only include Entrez and Ensembl Gene IDs that are present in `gene_info`
+    matrix["entrez_gene_id"] = matrix["entrez_gene_id"].str.split("//")
+    matrix = matrix.explode("entrez_gene_id").replace(to_replace="-", value=pd.NA).dropna(subset="entrez_gene_id")
+    matrix["entrez_gene_id"] = matrix["entrez_gene_id"].astype(int)
+
+    gene_info = gene_info_migrations(gene_info)
+    # gene_info = gene_info.replace(to_replace="-", value=pd.NA)
+    gene_info["entrez_gene_id"] = gene_info["entrez_gene_id"].astype(int)
+
+    counts_matrix = matrix.merge(
+        gene_info[["entrez_gene_id", "ensembl_gene_id"]],
+        on=["entrez_gene_id", "ensembl_gene_id"],
+        how="inner",
+    )
+    # Merging did not retain any sample columns; raise an error
+    if len(counts_matrix.columns) == 3:
+        raise ValueError(
+            "No sample columns found in the counts matrix after merging with gene information. "
+            "Please ensure that your counts matrix contains sample columns in addition to gene identifiers."
+ ) + + gene_info = gene_info.merge( + counts_matrix[["entrez_gene_id", "ensembl_gene_id"]], + on=["entrez_gene_id", "ensembl_gene_id"], + how="inner", + ) + + entrez_gene_ids: list[str] = gene_info["entrez_gene_id"].tolist() + metrics: NamedMetrics = {} + for study in metadata_df["study"].unique(): + study_sample_names = metadata_df[metadata_df["study"] == study]["sample_name"].tolist() + layouts = metadata_df[metadata_df["study"] == study]["layout"].tolist() + metrics[study] = _StudyMetrics( + count_matrix=counts_matrix[counts_matrix.columns.intersection(study_sample_names)], + fragment_lengths=metadata_df[metadata_df["study"] == study]["fragment_length"].values, + sample_names=study_sample_names, + layout=[LayoutMethod(layout) for layout in layouts], + num_samples=len(study_sample_names), + entrez_gene_ids=entrez_gene_ids, + gene_sizes=np.array(gene_info["size"].values).astype(np.float32), + study=study, + ) + metrics[study].fragment_lengths[np.isnan(metrics[study].fragment_lengths)] = 0 + metrics[study].count_matrix.index = pd.Index(entrez_gene_ids, name="entrez_gene_id") + + return _CountMetrics(metrics=metrics, entrez_gene_ids=gene_info["entrez_gene_id"].astype(int).tolist()) + + +def calculate_tpm(metrics: NamedMetrics) -> NamedMetrics: + """Calculate the Transcripts Per Million (TPM) for each sample in the metrics dictionary. + + Args: + metrics: A dictionary of study metrics to calculate TPM for. + + Returns: + A dictionary of study metrics with TPM calculated. + """ + for sample in metrics: + count_matrix = metrics[sample].count_matrix + + gene_sizes = pd.Series(metrics[sample].gene_sizes, index=count_matrix.index) + adjusted_counts = count_matrix.add(1e-6) + + tpm_matrix = adjusted_counts.divide(gene_sizes, axis=0) # (count + 1) / gene_length + tpm_matrix = tpm_matrix.div(tpm_matrix.sum(axis=0), axis=1) # normalize by total + tpm_matrix = tpm_matrix.mul(1e6) # scale to per-million + metrics[sample].normalization_matrix = tpm_matrix + + return metrics + + +def _calculate_fpkm(metrics: NamedMetrics, scale: float = 1e6) -> NamedMetrics: + """Calculate the Fragments Per Kilobase of transcript per Million mapped reads (FPKM) for each sample in the metrics dictionary. + + Args: + metrics: A dictionary of study metrics to calculate FPKM for. + scale: The scaling factor for normalization (default is 1e6). + + Returns: + A dictionary of study metrics with FPKM calculated. 
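For orientation, the normalization math used by `calculate_tpm` above (and by the FPKM/RPKM branch that follows) reduces to a couple of vectorized divisions. This is a minimal sketch with invented counts and gene lengths; it shows the formulas only, not the project's per-study bookkeeping or the small pseudo-count added in `calculate_tpm`.

```python
import pandas as pd

# Hypothetical raw counts (genes x samples) and gene lengths in base pairs.
counts = pd.DataFrame({"s1": [100, 300, 600], "s2": [10, 30, 60]}, index=["g1", "g2", "g3"])
gene_lengths = pd.Series([1_000, 2_000, 3_000], index=counts.index)

# TPM: divide by gene length in kilobases, then scale each sample (column) to sum to 1e6.
reads_per_kb = counts.div(gene_lengths / 1_000, axis=0)
tpm = reads_per_kb.div(reads_per_kb.sum(axis=0), axis=1) * 1e6

# FPKM/RPKM: scale by per-sample library size (per million reads), then divide by length in kilobases.
per_million = counts.sum(axis=0) / 1e6
fpkm = counts.div(per_million, axis=1).div(gene_lengths / 1_000, axis=0)

print(tpm.round(1))
print(fpkm.round(1))
```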
+ """ + for study in metrics: + matrix_values = [] + + for sample in range(metrics[study].num_samples): + layout = metrics[study].layout[sample] + count_matrix: npt.NDArray[np.float32] = metrics[study].count_matrix.iloc[:, sample].values + gene_lengths = ( + metrics[study].fragment_lengths[sample].astype(np.float32) + if layout == LayoutMethod.paired_end + else metrics[study].gene_sizes.astype(np.float32) + ) + gene_lengths_kb = gene_lengths / 1000.0 + + match layout: + case LayoutMethod.paired_end: # FPKM + total_fragments = count_matrix.sum(axis=0) + if total_fragments == 0: + fragments_per_kilobase_million = np.nan + else: + counts_per_million = total_fragments / scale + fragments_per_kilobase = count_matrix / gene_lengths_kb + fragments_per_kilobase_million = fragments_per_kilobase / counts_per_million + matrix_values.append(fragments_per_kilobase_million) + case LayoutMethod.single_end: # RPKM + reads_per_kilobase = count_matrix / gene_lengths_kb + total_reads = count_matrix.sum(axis=0) + counts_per_million = total_reads / scale + reads_per_kilobase_million = reads_per_kilobase / counts_per_million + matrix_values.append(reads_per_kilobase_million) + case _: + _log_and_raise_error( + ( + f"Invalid normalization method specified ''. " + f"Must be one of '{LayoutMethod.paired_end.value}' or '{LayoutMethod.single_end.value}'." + ), + error=ValueError, + level=LogLevel.ERROR, + ) + + # Transpose is needed because values were appended as rows + fpkm_matrix = pd.DataFrame(matrix_values).T + fpkm_matrix.index = metrics[study].count_matrix.index + fpkm_matrix.columns = metrics[study].sample_names + + fpkm_matrix = fpkm_matrix[~pd.isna(fpkm_matrix)] + metrics[study].normalization_matrix = fpkm_matrix + metrics[study].normalization_matrix.columns = metrics[study].count_matrix.columns + + return metrics + + +def _zfpkm_calculation( + column: pd.Series, + peak_parameters: PeakIdentificationParameters, + bandwidth: int = 0.5, +) -> _ZFPKMResult: + """Log2 Transformations. + + Stabilize the variance in the data to make the distribution more symmetric; this is helpful for Gaussian fitting + + Kernel Density Estimation (kde) + - SciKit Learn: https://scikit-learn.org/stable/modules/density.html + - Non-parametric method to estimate the probability density function (PDF) of a random variable + - Estimates the distribution of log2-transformed FPKM values + - Bandwidth parameter controls the smoothness of the density estimate + - KDE Explanation + - A way to smooth a histogram to get a better idea of the underlying distribution of the data + - Given a set of data points, we want to understand how they are distributed. 
+        Histograms can be useful, but are sensitive to bin size and number
+        - The KDE places a "kernel" - a small symmetric function (i.e., Gaussian curve) - at each data point
+        - The "kernel" acts as a weight, giving more weight to points closer to the center of the kernel,
+          and less weight to points farther away
+        - Kernel functions are summed along each point on the x-axis
+        - A smooth curve is created that represents the estimated density of the data
+
+    Peak Finding
+    - Identifies peaks that are above a certain height and separated by a minimum distance
+    - These peaks represent potential local maxima in the distribution
+
+    Peak Selection
+    - The peak with the highest x-value (from log2-FPKM) is chosen as the mean (mu)
+      of the "inactive" gene distribution
+    - The peak representing unexpressed or inactive genes should be at a lower FPKM
+      value compared to the peak representing expressed genes
+
+    Standard Deviation Estimation
+    - The mean (u) of the log2-FPKM values that are greater than the calculated mu is computed
+    - The standard deviation is estimated based on the assumption that the right tail of the distribution
+      (which represents expressed genes) can be approximated by a half-normal distribution
+
+    zFPKM Transformation
+    - Centers the distribution around 0 and scales it by the standard deviation.
+      This makes it easier to compare gene expression across different samples
+    - Represents the number of standard deviations away from the mean of the "inactive" gene distribution
+    - Higher zFPKM values indicate higher expression levels relative to the "inactive" peak
+    - A zFPKM value of 0 represents the mean of the "inactive" distribution
+    - Research shows that a zFPKM value of -3 or greater can be used as
+      a threshold for calling a gene "active" and/or "expressed"
+      (https://doi.org/10.1186/1471-2164-14-778)
+
+    Args:
+        column: A Pandas Series containing FPKM values for a single sample.
+        peak_parameters: Parameters for peak identification in zFPKM calculation.
+        bandwidth: The bandwidth for kernel density estimation in zFPKM calculation.
+
+    Returns:
+        A named tuple containing:
+        - zfpkm: A Pandas Series of zFPKM values for the input sample.
+        - density: A named tuple containing the x and y values of the KDE.
+        - mu: The mean of the "inactive" gene distribution.
+        - std_dev: The estimated standard deviation of the "inactive" gene distribution.
+        - max_fpkm: The maximum FPKM value at the identified peak.
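The docstring above walks through the whole procedure; the snippet below is a compact, standalone sketch of the same steps (KDE, peak finding, half-normal standard deviation, z-transform) on simulated log2 abundances. It mirrors `_zfpkm_calculation` but is not a drop-in replacement; the simulated data and the peak-finding parameters (which happen to match the defaults used further down in this file) are illustrative.

```python
import numpy as np
from scipy.signal import find_peaks
from sklearn.neighbors import KernelDensity

rng = np.random.default_rng(0)
# Simulated log2 abundances: a low peak near -2 and a high peak near 4.
log2_fpkm = np.concatenate([rng.normal(-2, 1.0, 4_000), rng.normal(4, 1.5, 6_000)])

# Kernel density estimate of the distribution.
kde = KernelDensity(kernel="gaussian", bandwidth=0.5).fit(log2_fpkm.reshape(-1, 1))
x = np.linspace(log2_fpkm.min(), log2_fpkm.max(), 2_000)
density = np.exp(kde.score_samples(x.reshape(-1, 1)))

# Peak finding and selection: take the right-most peak, as _zfpkm_calculation does.
peaks, _ = find_peaks(density, height=0.02, distance=1)
mu = x[peaks].max()

# Half-normal estimate of the standard deviation from the right tail, then the z-transform.
u = log2_fpkm[log2_fpkm > mu].mean()
std_dev = (u - mu) * np.sqrt(np.pi / 2)
zfpkm = (log2_fpkm - mu) / std_dev
print(f"mu={mu:.2f} std_dev={std_dev:.2f} fraction above -3: {(zfpkm >= -3).mean():.2f}")
```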
+ """ + values: npt.NDArray = column.values + # replace na values with 0 + values = np.nan_to_num(values, nan=0.0) + refit: KernelDensity = KernelDensity(kernel="gaussian", bandwidth=bandwidth) + refit.fit(values.reshape(-1, 1)) + + x_range = np.linspace(values.min(), values.max(), 2000) + density = np.exp(refit.score_samples(x_range.reshape(-1, 1))) + peaks, _ = find_peaks(density, height=peak_parameters.height, distance=peak_parameters.distance) + peak_positions = x_range[peaks] + + mu = 0 + max_fpkm = 0 + stddev = 1 + + if len(peaks) != 0: + mu = peak_positions.max() + max_fpkm = density[peaks[np.argmax(peak_positions)]] + u = values[values > mu].mean() + stddev = (u - mu) * np.sqrt(np.pi / 2) + zfpkm = pd.Series((values - mu) / stddev, dtype=np.float32, name=column.name) + + return _ZFPKMResult(zfpkm=zfpkm, density=Density(x_range, density), mu=mu, std_dev=stddev, max_fpkm=max_fpkm) + + +def zfpkm_transform( + fpkm_df: pd.DataFrame, + peak_parameters: PeakIdentificationParameters, + bandwidth: int, + update_every_percent: float = 0.1, +) -> tuple[dict[str, _ZFPKMResult], DataFrame]: + """Perform zFPKM calculation/transformation. + + Args: + fpkm_df: A DataFrame containing FPKM values with genes as rows and samples as columns. + peak_parameters: Parameters for peak identification in zFPKM calculation. + bandwidth: The bandwidth for kernel density estimation in zFPKM calculation. + update_every_percent: Frequency of progress updates as a decimal between 0 and 1 (e.g., 0.1 for every 10%). + + Returns: + A tuple containing: + - A dictionary of intermediate results for each sample. + - A DataFrame of zFPKM values with the same shape as the input fpkm_df. + """ + if update_every_percent > 1: + logger.warning(f"update_every_percent should be a decimal value between 0 and 1; got: {update_every_percent} - will convert to percentage") + update_every_percent /= 100 + + total_samples = num_columns(fpkm_df) + update_per_step: int = int(np.ceil(total_samples * update_every_percent)) + + # Get at least 1 core and at most cpu_count() - 2 + cores = max(min(multiprocessing.cpu_count() - 2, total_samples), 1) + logger.debug(f"zFPKM transforming {len(fpkm_df.columns)} sample(s) containing {len(fpkm_df):,} genes(s) using {cores} core(s)") + logger.debug(f"Will update every {update_per_step:,} steps (~{update_every_percent:.1%} of {total_samples:,})") + + chunk_time = time.time() + start_time = time.time() + log_padding = len(str(f"{total_samples:,}")) + zfpkm_series: list[pd.Series] = [] + results: dict[str, _ZFPKMResult] = {} + + with ProcessPoolExecutor(max_workers=cores) as pool: + futures: list[Future[_ZFPKMResult]] = [ + pool.submit( + _zfpkm_calculation, + column=fpkm_df[column], + peak_parameters=peak_parameters, + bandwidth=bandwidth, + ) + for column in fpkm_df + ] + for i, future in enumerate(as_completed(futures)): + result = future.result() + key = str(result.zfpkm.name) + results[key] = result + zfpkm_series.append(result.zfpkm) + + if i != 0 and ((i + 1) % update_per_step == 0 or (i + 1) == total_samples): + current_time = time.time() + chunk = current_time - chunk_time + total_time = current_time - start_time + chunk_num = f"{i + 1:,}" + logger.debug( + f"Processed {chunk_num:>{log_padding}} of {total_samples:,} - " + f"chunk took {chunk:.1f} seconds - " + f"running for {total_time:.1f} seconds" + ) + chunk_time = current_time + + zfpkm_df = pd.DataFrame({series.name: series for series in zfpkm_series}, index=fpkm_df.index) + return results, zfpkm_df + + +def zfpkm_plot(results, *, 
output_png_filepath: Path, plot_xfloor: int = -4): + """Plot the log2(FPKM) density and fitted Gaussian for each sample. + + :param results: A dictionary of intermediate results from zfpkm_transform. + :param output_png_filepath: Output filepath location + :param: subplot_titles: Whether to display facet titles (sample names). + :param plot_xfloor: Lower limit for the x-axis. + :param subplot_titles: Whether to display facet titles (sample names). + """ + to_concat: list[pd.DataFrame] = [None] * len(results) # type: ignore # ignoring because None is not of type pd.DataFrame + for name, result in results.items(): + stddev = result.std_dev + x = np.array(result.density.x) + y = np.array(result.density.y) + + fitted = np.exp(-0.5 * ((x - result.mu) / stddev) ** 2) / (stddev * np.sqrt(2 * np.pi)) + max_fpkm = y.max() + max_fitted = fitted.max() + scale_fitted = fitted * (max_fpkm / max_fitted) + + to_concat.append( + pd.DataFrame( + { + "sample_name": [name] * len(x), + "log2fpkm": x, + "fpkm_density": y, + "fitted_density_scaled": scale_fitted, + } + ) + ) + mega_df = pd.concat(to_concat, ignore_index=True) + mega_df.columns = pd.Series(data=["sample_name", "log2fpkm", "fpkm_density", "fitted_density_scaled"]) + mega_df = mega_df.melt(id_vars=["log2fpkm", "sample_name"], var_name="source", value_name="density") + + fig: plt.Figure + axes: plt.Axes + fig, axes = plt.subplots(nrows=len(results), ncols=1, figsize=(8, 4 * len(results))) + if len(results) == 1: + axes: list[plt.Axes] = [axes] + + for i, sample_name in enumerate(results): + sample_data = mega_df[mega_df["sample_name"] == sample_name] + axis = axes[i] + + for source_type in sample_data["source"].unique(): + group = sample_data[sample_data["source"] == source_type] + sns.lineplot(data=group, x="log2fpkm", y="density", label=source_type, ax=axis) + + axis.set_xlim(plot_xfloor, sample_data["log2fpkm"].max()) + axis.set_xlabel("log2(FPKM)") + axis.set_ylabel("density [scaled]") + axis.legend(title="Source") + + fig.tight_layout() + if output_png_filepath.suffix != ".png": + logger.warning(f"Output filepath did not end in '.png', setting to '.png' now. Got: '{output_png_filepath.suffix}'") + output_png_filepath = output_png_filepath.with_suffix(".png") + fig.savefig(output_png_filepath) + + +def calculate_z_score(metrics: NamedMetrics) -> NamedMetrics: + """Calculate the z-score for each sample in the metrics dictionary. + + Args: + metrics: A dictionary of study metrics to calculate z-scores for. + + Returns: + A dictionary of study metrics with z-scores calculated. + """ + for sample in metrics: + log_matrix = np.log(metrics[sample].normalization_matrix) + z_matrix = pd.DataFrame(data=sklearn.preprocessing.scale(log_matrix, axis=1), columns=metrics[sample].sample_names) + metrics[sample].z_score_matrix = z_matrix + return metrics + + +def cpm_filter( + *, + context_name: str, + metrics: NamedMetrics, + filtering_options: _FilteringOptions, + prep: RNAType, +) -> NamedMetrics: + """Apply Counts Per Million (CPM) filtering to the count matrix for a given sample. + + Args: + context_name: The name of the context being processed. + metrics: A dictionary of study metrics to filter. + filtering_options: Options for filtering the count matrix. + prep: The RNA preparation type. + + Returns: + A dictionary of filtered study metrics. 
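Both `plot_gaussian_fit` earlier in this diff and `zfpkm_plot` above overlay a fitted normal curve on the estimated density after rescaling the fit so its peak matches the density's peak. A minimal sketch of that rescaling step with synthetic data:

```python
import matplotlib.pyplot as plt
import numpy as np
from scipy import stats

x = np.linspace(-6, 10, 500)
density = 0.3 * stats.norm.pdf(x, loc=-2, scale=1.0) + 0.7 * stats.norm.pdf(x, loc=4, scale=1.5)

mu, sd = 4.0, 1.5                                   # parameters of the fitted component
fitted = stats.norm.pdf(x, loc=mu, scale=sd)
scaled = fitted * (density.max() / fitted.max())    # match the fitted curve's peak to the density's peak

plt.plot(x, density, label="density")
plt.plot(x, scaled, linestyle="--", label="fitted Gaussian (scaled)")
plt.xlabel("log2(FPKM)")
plt.ylabel("density")
plt.legend()
plt.savefig("gaussian_overlay_example.png")
```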
+ """ + config = Config() + n_exp = filtering_options.replicate_ratio + n_top = filtering_options.high_replicate_ratio + cut_off = filtering_options.cut_off + + sample: str + metric: _StudyMetrics + for sample, metric in metrics.items(): + counts: pd.DataFrame = metric.count_matrix + entrez_ids: list[str] = metric.entrez_gene_ids + library_size: pd.DataFrame = counts.sum(axis=1) + + # For library_sizes equal to 0, add 1 to prevent divide by 0 + # This will not impact the final counts per million calculation because the original counts are still 0 + # thus, (0 / 1) * 1_000_000 = 0 + library_size[library_size == 0] = 1 + + output_filepath = config.result_dir / context_name / prep.value / f"CPM_Matrix_{prep.value}_{sample}.csv" + output_filepath.parent.mkdir(parents=True, exist_ok=True) + counts_per_million: pd.DataFrame = (counts / library_size) * 1_000_000 + counts_per_million.insert(0, "entrez_gene_ids", pd.Series(entrez_ids)) + logger.debug(f"Writing CPM matrix to {output_filepath}") + counts_per_million.dropna(inplace=True) + counts_per_million.to_csv(output_filepath, index=False) + + # TODO: Counts per million is adding ~61,500 columns (equal to the number of genes) for some reason. + # Most likely due to multiplying by 1_000_000, not exactly sure why + + min_samples = round(n_exp * len(counts.columns)) # noqa: F841 + top_samples = round(n_top * len(counts.columns)) # noqa: F841 + test_bools = pd.DataFrame({"entrez_gene_ids": entrez_ids}) + for i in range(len(counts_per_million.columns)): + median_sum = np.float64(np.median(np.sum(counts[:, i]))) + if cut_off == "default": # noqa: SIM108 + cutoff = np.float64(10e6) / median_sum + else: + cutoff = np.float64(1e6 * cut_off) / median_sum + test_bools = test_bools.merge(counts_per_million[counts_per_million.iloc[:, i] > cutoff]) + + return metrics + + +def tpm_quantile_filter(*, metrics: NamedMetrics, filtering_options: _FilteringOptions) -> NamedMetrics: + """Apply quantile-based filtering to the TPM matrix for a given sample. + + Args: + metrics: A dictionary of study metrics to filter. + filtering_options: Options for filtering the count matrix. + + Returns: + A dictionary of filtered study metrics. 
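Counts per million normalizes each sample by its library size and rescales to one million. A toy version of the calculation, including the same guard against zero library sizes used in `cpm_filter` above (all numbers are invented):

```python
import pandas as pd

counts = pd.DataFrame({"s1": [90, 10, 0], "s2": [0, 0, 0]}, index=["g1", "g2", "g3"])

library_size = counts.sum(axis=0)
library_size[library_size == 0] = 1   # avoid dividing by zero; 0 / 1 * 1e6 is still 0

cpm = counts.div(library_size, axis=1) * 1_000_000
print(cpm)
#           s1   s2
# g1  900000.0  0.0
# g2  100000.0  0.0
# g3       0.0  0.0
```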
+ """ + # TODO: Write the TPM matrix to disk + + n_exp = filtering_options.replicate_ratio + n_top = filtering_options.high_replicate_ratio + cut_off = filtering_options.cut_off + metrics = calculate_tpm(metrics) + + sample: str + metric: _StudyMetrics + for sample, metric in metrics.items(): + entrez_ids = metric.entrez_gene_ids + gene_size = metric.gene_sizes + tpm_matrix: pd.DataFrame = metric.normalization_matrix + + min_samples = round(n_exp * tpm_matrix.shape[1]) + top_samples = round(n_top * tpm_matrix.shape[1]) + + tpm_quantile = tpm_matrix[tpm_matrix > 0] + quantile_cutoff = np.quantile(a=tpm_quantile.values, q=1 - (cut_off / 100), axis=0) # Compute quantile across columns + boolean_expression = pd.DataFrame(data=(tpm_matrix > quantile_cutoff), index=tpm_matrix.index, columns=tpm_matrix.columns, dtype=int) + + min_genes: npt.NDArray[bool] = k_over_a(boolean_expression, min_samples, 0.9) + top_genes: npt.NDArray[bool] = k_over_a(boolean_expression, top_samples, 0.9) + + # Only keep `entrez_gene_ids` that pass `min_genes` + metric.entrez_gene_ids = boolean_expression.index[min_genes].tolist() + metric.gene_sizes = gene_size[min_genes] + metric.count_matrix = metric.count_matrix.loc[min_genes, :] + metric.normalization_matrix = metrics[sample].normalization_matrix.loc[min_genes, :] + + metric.high_confidence_entrez_gene_ids = entrez_ids[top_genes].tolist() + + metrics = calculate_z_score(metrics) + + return metrics + + +def zfpkm_filter( + *, + metrics: NamedMetrics, + filtering_options: _FilteringOptions, + calculate_fpkm: bool, + force_zfpkm_plot: bool, + peak_parameters: PeakIdentificationParameters, + bandwidth: int, + output_png_filepath: Path | None, +) -> NamedMetrics: + """Apply zFPKM filtering to the FPKM matrix for a given sample. + + Args: + metrics: A dictionary of study metrics to filter. + filtering_options: Options for filtering the count matrix. + calculate_fpkm: Whether to calculate FPKM from counts. + force_zfpkm_plot: Whether to force plotting of zFPKM results even if there are many samples. + peak_parameters: Parameters for peak identification in zFPKM calculation. + bandwidth: The bandwidth for kernel density estimation in zFPKM calculation. + output_png_filepath: Optional filepath to save the zFPKM plot. + + Returns: + A dictionary of filtered study metrics. + """ + min_sample_expression = filtering_options.replicate_ratio + high_confidence_sample_expression = filtering_options.high_replicate_ratio + cut_off = filtering_options.cut_off + metrics = _calculate_fpkm(metrics) if calculate_fpkm else metrics + + for metric in metrics.values(): + metric: _StudyMetrics + # if fpkm was not calculated, the normalization matrix will be empty; collect the count matrix instead + matrix = metric.count_matrix if metric.normalization_matrix.empty else metric.normalization_matrix + matrix = matrix[matrix.sum(axis=1) > 0] # remove rows (genes) that have no counts across all samples + + results, zfpkm_df = zfpkm_transform(matrix, peak_parameters=peak_parameters, bandwidth=bandwidth) + zfpkm_df[(matrix == 0) | (zfpkm_df.isna())] = -4 + + if len(results) > 10 and not force_zfpkm_plot: + logger.warning( + "Not plotting zFPKM results because more than 10 plots would be created. 
" + "If you would like to plot them anyway, set 'force_zfpkm_plot' to True" + ) + elif output_png_filepath is None: + logger.critical("Output zFPKM PNG filepath is None, set a path to plot zFPKM graphs") + else: + output_png_filepath.parent.mkdir(parents=True, exist_ok=True) + output_png_filepath.unlink(missing_ok=True) + zfpkm_plot(results, output_png_filepath=output_png_filepath) + + metric.z_score_matrix = zfpkm_df + + # determine which genes are expressed + min_samples = round(min_sample_expression * zfpkm_df.shape[1]) # [1] is the number of columns + mask = k_over_a(zfpkm_df, min_samples, cut_off) + metric.entrez_gene_ids = zfpkm_df.index[mask].tolist() + + # determine which genes are confidently expressed + top_samples = round(high_confidence_sample_expression * zfpkm_df.shape[1]) # [1] is the number of columns + top_mask = k_over_a(zfpkm_df, top_samples, cut_off) + metric.high_confidence_entrez_gene_ids = zfpkm_df.index[top_mask].tolist() + + return metrics + + +def filter_counts( + *, + context_name: str, + metrics: NamedMetrics, + technique: FilteringTechnique, + filtering_options: _FilteringOptions, + prep: RNAType, + force_zfpkm_plot: bool, + peak_parameters: PeakIdentificationParameters, + bandwidth: int, + output_png_filepath: Path | None = None, +) -> NamedMetrics: + """Filter the count matrix based on the specified technique. + + Args: + context_name: The name of the context being processed. + metrics: A dictionary of study metrics to filter. + technique: The filtering technique to use. + filtering_options: Options for filtering the count matrix. + prep: The RNA preparation type. + force_zfpkm_plot: Whether to force plotting of zFPKM results even if there are many samples. + peak_parameters: Parameters for peak identification in zFPKM calculation. + bandwidth: The bandwidth for kernel density estimation in zFPKM calculation. + output_png_filepath: Optional filepath to save the zFPKM plot. + + Returns: + A dictionary of filtered study metrics. 
+ """ + match technique: + case FilteringTechnique.CPM: + return cpm_filter(context_name=context_name, metrics=metrics, filtering_options=filtering_options, prep=prep) + case FilteringTechnique.TPM: + return tpm_quantile_filter(metrics=metrics, filtering_options=filtering_options) + case FilteringTechnique.ZFPKM: + return zfpkm_filter( + metrics=metrics, + filtering_options=filtering_options, + calculate_fpkm=True, + force_zfpkm_plot=force_zfpkm_plot, + peak_parameters=peak_parameters, + bandwidth=bandwidth, + output_png_filepath=output_png_filepath, + ) + case FilteringTechnique.UMI: + # UMI filtering is the same as zFPKM filtering without calculating FPKM + return zfpkm_filter( + metrics=metrics, + filtering_options=filtering_options, + calculate_fpkm=False, + force_zfpkm_plot=force_zfpkm_plot, + peak_parameters=peak_parameters, + bandwidth=bandwidth, + output_png_filepath=output_png_filepath, + ) + case _: + _log_and_raise_error( + f"Technique must be one of {FilteringTechnique}, got '{technique.value}'", + error=ValueError, + level=LogLevel.ERROR, + ) + + +async def _process( + context_name: str, + rnaseq_matrix_filepath: Path, + metadata_df: pd.DataFrame, + gene_info_df: pd.DataFrame, + prep: RNAType, + taxon: int, + replicate_ratio: float, + batch_ratio: float, + high_replicate_ratio: float, + high_batch_ratio: float, + technique: FilteringTechnique, + cut_off: int | float, + force_zfpkm_plot: bool, + peak_parameters: PeakIdentificationParameters, + bandwidth: int, + output_boolean_activity_filepath: Path, + output_zscore_normalization_filepath: Path, + output_zfpkm_png_filepath: Path | None, +): + """Save the results of the RNA-Seq tests to a CSV file.""" + output_boolean_activity_filepath.parent.mkdir(parents=True, exist_ok=True) + + rnaseq_matrix: pd.DataFrame | sc.AnnData = await read_file(rnaseq_matrix_filepath) + if isinstance(rnaseq_matrix, pd.DataFrame): + rnaseq_matrix: pd.DataFrame + elif isinstance(rnaseq_matrix, sc.AnnData): + rnaseq_matrix: sc.AnnData + + if isinstance(rnaseq_matrix, sc.AnnData): + rnaseq_matrix: sc.AnnData + conversion = await gene_symbol_to_ensembl_and_gene_id(symbols=rnaseq_matrix.var_names.tolist(), taxon=taxon) + conversion.reset_index(inplace=True, drop=False) + rnaseq_matrix = rnaseq_matrix.merge(conversion, how="left", on="gene_symbol") + # rnaseq_matrix = rnaseq_matrix.replace(to_replace=pd.NA, value="-") + + filtering_options = _FilteringOptions( + replicate_ratio=replicate_ratio, + batch_ratio=batch_ratio, + cut_off=float(cut_off), + high_replicate_ratio=high_replicate_ratio, + high_batch_ratio=high_batch_ratio, + ) + + count_metrics: _CountMetrics = await _build_count_metrics(matrix=rnaseq_matrix, gene_info=gene_info_df, metadata_df=metadata_df, taxon=taxon) + metrics: NamedMetrics = count_metrics.metrics + entrez_gene_ids: list[int] = count_metrics.entrez_gene_ids + + metrics = filter_counts( + context_name=context_name, + metrics=metrics, + technique=technique, + filtering_options=filtering_options, + prep=prep, + force_zfpkm_plot=force_zfpkm_plot, + peak_parameters=peak_parameters, + bandwidth=bandwidth, + output_png_filepath=output_zfpkm_png_filepath, + ) + + merged_zscore_df = pd.DataFrame() + expressed_genes: list[str] = [] + top_genes: list[str] = [] + for metric in metrics.values(): + expressed_genes.extend(metric.entrez_gene_ids) + top_genes.extend(metric.high_confidence_entrez_gene_ids) + + merged_zscore_df = ( + metric.z_score_matrix + if merged_zscore_df.empty + else merged_zscore_df.merge( + metric.z_score_matrix, + how="outer", 
+ left_index=True, + right_index=True, + ) + ) + merged_zscore_df[merged_zscore_df.isna()] = -4 + + # If any of the normalization metrics are not empty, write the normalized metrics to disk + if not all(metric.normalization_matrix.empty for metric in metrics.values()): + merged_zscore_df.to_csv(output_zscore_normalization_filepath, index=True) + logger.success(f"Wrote z-score normalization matrix to {output_zscore_normalization_filepath}") + else: + logger.warning( + "Not writing z-score normalization matrix because no normalization matrices exist. This is expected if you are using UMI filtering." + ) + + expression_frequency = pd.Series(expressed_genes).value_counts() + expression_df = pd.DataFrame({"entrez_gene_id": expression_frequency.index, "frequency": expression_frequency.values}) + expression_df["prop"] = expression_df["frequency"] / len(metrics) + expression_df = expression_df[expression_df["prop"] >= filtering_options.batch_ratio] + + top_frequency = pd.Series(top_genes).value_counts() + top_df = pd.DataFrame({"entrez_gene_id": top_frequency.index, "frequency": top_frequency.values}) + top_df["prop"] = top_df["frequency"] / len(metrics) + top_df = top_df[top_df["prop"] >= filtering_options.high_batch_ratio] + + boolean_matrix = pd.DataFrame(data={"entrez_gene_id": entrez_gene_ids, "expressed": 0, "high": 0}) + for gene in entrez_gene_ids: + if gene in expression_df["entrez_gene_id"]: + boolean_matrix.loc[gene, "expressed"] = 1 + if gene in top_df["entrez_gene_id"]: + boolean_matrix.loc[gene, "high"] = 1 + + expressed_count = len(boolean_matrix[boolean_matrix["expressed"] == 1]) + high_confidence_count = len(boolean_matrix[boolean_matrix["high"] == 1]) + + boolean_matrix.dropna(subset="entrez_gene_id", inplace=True) + boolean_matrix.to_csv(output_boolean_activity_filepath, index=False) + logger.info(f"{context_name} - Found {expressed_count} expressed genes, {high_confidence_count} of which are confidently expressed") + logger.success(f"Wrote boolean matrix to {output_boolean_activity_filepath}") + + +async def rnaseq_gen( # noqa: C901 + context_name: str, + input_rnaseq_filepath: Path, + input_gene_info_filepath: Path, + prep: RNAType, + taxon_id: int, + output_boolean_activity_filepath: Path, + output_zscore_normalization_filepath: Path, + input_metadata_filepath_or_df: Path | pd.DataFrame, + replicate_ratio: float = 0.5, + high_replicate_ratio: float = 1.0, + batch_ratio: float = 0.5, + high_batch_ratio: float = 1.0, + technique: FilteringTechnique | str = FilteringTechnique.ZFPKM, + zfpkm_peak_height: float = 0.02, + zfpkm_peak_distance: float = 1.0, + zfpkm_bandwidth: int = 1, + cutoff: int | float | None = None, + force_zfpkm_plot: bool = False, + log_level: LogLevel = LogLevel.INFO, + log_location: str | TextIOWrapper = sys.stderr, + output_zfpkm_png_filepath: Path | None = None, +) -> None: + """Generate a list of active and high-confidence genes from a gene count matrix. + + Replicates are compared for consensus within the study/batch number according to replicate ratios, + then study/batch numbers are checked for consensus according to batch ratios. 
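The two consensus levels implemented above reduce to a handful of pandas operations: within a study, a gene is called expressed if enough replicates pass the cutoff (the `k_over_a` test), and it is kept overall if enough studies agree. The matrices, ratios, and cutoff below are illustrative only:

```python
import pandas as pd

replicate_ratio, batch_ratio, cutoff = 0.5, 0.5, -3.0

# Hypothetical per-study zFPKM matrices (genes x replicates).
studies = {
    "S1": pd.DataFrame({"r1": [1.0, -4.0], "r2": [0.5, -3.5]}, index=[111, 222]),
    "S2": pd.DataFrame({"r1": [-3.5, 0.2], "r2": [1.1, -3.8]}, index=[111, 222]),
    "S3": pd.DataFrame({"r1": [-4.0, -3.6], "r2": [-3.9, -3.7]}, index=[111, 222]),
}

expressed_per_study: list[int] = []
for zfpkm in studies.values():
    min_samples = round(replicate_ratio * zfpkm.shape[1])    # replicates required within a study
    mask = (zfpkm >= cutoff).sum(axis=1) >= min_samples      # the k_over_a test
    expressed_per_study.extend(zfpkm.index[mask].tolist())

frequency = pd.Series(expressed_per_study).value_counts()
proportion = frequency / len(studies)                        # fraction of studies calling the gene expressed
consensus = proportion[proportion >= batch_ratio].index.tolist()
print(consensus)   # [111] -> expressed in 2 of 3 studies
```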
+ The zFPKM method is outlined here: https://pubmed.ncbi.nlm.nih.gov/24215113/ + + :param context_name: The name of the context being processed + :param input_rnaseq_filepath: The filepath to the gene count matrix + :param input_gene_info_filepath: The filepath to the gene info file + :param output_boolean_activity_filepath: The filepath to write the output gene count matrix + :param output_zscore_normalization_filepath: The filepath to write the output z-score normalization matrix + :param prep: The preparation method + :param taxon_id: The NCBI Taxon ID + :param input_metadata_filepath_or_df: The filepath or dataframe containing metadata information + :param replicate_ratio: The percentage of replicates that a gene must + appear in for a gene to be marked as "active" in a batch/study + :param batch_ratio: The percentage of batches that a gene must appear in for a gene to be marked as 'active" + :param high_replicate_ratio: The percentage of replicates that a gene must + appear in for a gene to be marked "highly confident" in its expression in a batch/study + :param high_batch_ratio: The percentage of batches that a gene must + appear in for a gene to be marked "highly confident" in its expression + :param technique: The filtering technique to use + :param zfpkm_peak_height: The height of the zFPKM peak + :param zfpkm_peak_distance: The distance of the zFPKM peak + :param zfpkm_bandwidth: The bandwidth of the zFPKM + :param cutoff: The cutoff value to use for the provided filtering technique + :param force_zfpkm_plot: If too many samples exist, should plotting be done anyway? + :param log_level: The level of logging to output + :param log_location: The location to write logs to + :param output_zfpkm_png_filepath: Optional filepath to save zFPKM plots + :return: None + """ + set_up_logging(level=log_level, location=log_location) + + technique = FilteringTechnique(technique) if isinstance(technique, str) else technique + if technique == FilteringTechnique.TPM: + cutoff = cutoff or 25 + if cutoff < 1 or cutoff > 100: + _log_and_raise_error( + "Quantile must be between 1 - 100", + error=ValueError, + level=LogLevel.ERROR, + ) + elif technique == FilteringTechnique.CPM: + if cutoff and cutoff < 0: + _log_and_raise_error( + "Cutoff must be greater than or equal to 0", + error=ValueError, + level=LogLevel.ERROR, + ) + elif cutoff: + cutoff = "default" + elif technique in {FilteringTechnique.ZFPKM, FilteringTechnique.UMI}: + cutoff = cutoff or -3 + else: + _log_and_raise_error( + f"Technique must be one of {','.join(FilteringTechnique)}. Got: {technique.value}", + error=ValueError, + level=LogLevel.ERROR, + ) + + if not input_rnaseq_filepath.exists(): + _log_and_raise_error( + f"Input RNA-seq file not found! Searching for: '{input_rnaseq_filepath}'", + error=FileNotFoundError, + level=LogLevel.ERROR, + ) + + if prep == RNAType.SCRNA and technique.value.lower() != FilteringTechnique.UMI.value.lower(): + logger.warning( + "Single cell filtration does not normalize and assumes " + "genes are counted with Unique Molecular Identifiers (UMIs). " + f"Switching filtering technique from '{technique.value}' to '{FilteringTechnique.UMI.value}'." 
+ ) + technique = FilteringTechnique.UMI + + if isinstance(input_metadata_filepath_or_df, pd.DataFrame): + metadata_df = input_metadata_filepath_or_df + elif isinstance(input_metadata_filepath_or_df, Path): + if input_metadata_filepath_or_df.suffix not in {".xlsx", ".xls"}: + _log_and_raise_error( + f"Expected an excel file with extension of '.xlsx' or '.xls', got '{input_metadata_filepath_or_df.suffix}'.", + error=ValueError, + level=LogLevel.ERROR, + ) + if not input_metadata_filepath_or_df.exists(): + _log_and_raise_error( + f"Input metadata file not found! Searching for: '{input_metadata_filepath_or_df}'", + error=FileNotFoundError, + level=LogLevel.ERROR, + ) + + metadata_df = pd.read_excel(input_metadata_filepath_or_df) + else: + _log_and_raise_error( + f"Expected a pandas DataFrame or Path object as metadata, got '{type(input_metadata_filepath_or_df)}'", + error=TypeError, + level=LogLevel.ERROR, + ) + + metadata_df["fragment_length"] = metadata_df["fragment_length"].astype(np.float32) + metadata_df = metadata_df.groupby("sample_name", as_index=False).agg( + { + "sample_name": "first", + "fragment_length": "mean", + "layout": "first", + "strand": "first", + "study": "first", + "library_prep": "first", + } + ) + logger.debug(f"Starting '{context_name}'") + await _process( + context_name=context_name, + rnaseq_matrix_filepath=input_rnaseq_filepath, + metadata_df=metadata_df, + gene_info_df=await read_file(input_gene_info_filepath), + prep=prep, + taxon=taxon_id, + replicate_ratio=replicate_ratio, + batch_ratio=batch_ratio, + high_replicate_ratio=high_replicate_ratio, + high_batch_ratio=high_batch_ratio, + technique=technique, + cut_off=cutoff, + force_zfpkm_plot=force_zfpkm_plot, + peak_parameters=PeakIdentificationParameters(height=zfpkm_peak_height, distance=zfpkm_peak_distance), + bandwidth=zfpkm_bandwidth, + output_boolean_activity_filepath=output_boolean_activity_filepath, + output_zscore_normalization_filepath=output_zscore_normalization_filepath, + output_zfpkm_png_filepath=output_zfpkm_png_filepath, + ) diff --git a/main/como/rnaseq_preprocess.py b/main/como/rnaseq_preprocess.py new file mode 100644 index 00000000..d6c9fb3d --- /dev/null +++ b/main/como/rnaseq_preprocess.py @@ -0,0 +1,882 @@ +from __future__ import annotations + +import asyncio +import functools +import json +import re +import sys +from collections.abc import Generator +from dataclasses import asdict, dataclass, field +from functools import reduce +from itertools import chain +from pathlib import Path +from typing import Final, Literal, TextIO + +import aiofiles +import numpy as np +import pandas as pd +import scanpy as sc +from fast_bioservices.biothings.mygene import MyGene +from fast_bioservices.pipeline import ensembl_to_gene_id_and_symbol, gene_symbol_to_ensembl_and_gene_id +from loguru import logger + +from como.data_types import PATH_TYPE, LogLevel, RNAType +from como.utils import _log_and_raise_error, listify, read_file, set_up_logging + + +@dataclass +class _STARinformation: + num_unmapped: list[int] + num_multimapping: list[int] + num_no_feature: list[int] + num_ambiguous: list[int] + gene_names: list[str] + count_matrix: pd.DataFrame + + @property + def num_genes(self) -> int: + return len(self.count_matrix) + + @classmethod + async def build_from_tab(cls, filepath: Path) -> _STARinformation: + if filepath.suffix != ".tab": + _log_and_raise_error( + f"Building STAR information requires a '.tab' file; received: '{filepath}'", + error=ValueError, + level=LogLevel.ERROR, + ) + if not filepath.exists(): 
+ _log_and_raise_error( + f"Unable to find the .tab file '{filepath}'", + error=FileNotFoundError, + level=LogLevel.ERROR, + ) + + async with aiofiles.open(filepath) as i_stream: + # Cannot use `asyncio.gather()` here because the order of execution is not guaranteed + unmapped = await i_stream.readline() + multimapping = await i_stream.readline() + no_feature = await i_stream.readline() + ambiguous = await i_stream.readline() + + num_unmapped = [int(i) for i in unmapped.removesuffix("\n").split("\t")[1:]] + num_multimapping = [int(i) for i in multimapping.removesuffix("\n").split("\t")[1:]] + num_no_feature = [int(i) for i in no_feature.removesuffix("\n").split("\t")[1:]] + num_ambiguous = [int(i) for i in ambiguous.removesuffix("\n").split("\t")[1:]] + + df: pd.DataFrame = await read_file( + filepath, + h5ad_as_df=True, + sep="\t", + header=None, + skiprows=4, + names=[ + "ensembl_gene_id", + "unstranded_rna_counts", + "first_read_transcription_strand", + "second_read_transcription_strand", + ], + ) + df = df[~df["ensembl_gene_id"].isna()] + return _STARinformation( + num_unmapped=num_unmapped, + num_multimapping=num_multimapping, + num_no_feature=num_no_feature, + num_ambiguous=num_ambiguous, + gene_names=df["ensembl_gene_id"].values.tolist(), + count_matrix=df, + ) + + +@dataclass +class _StudyMetrics: + study_name: str + count_files: list[Path] + strand_files: list[Path] + _sample_names: list[str] = field(default_factory=list) + _num_samples: int = 0 + + @property + def sample_names(self) -> list[str]: + return self._sample_names + + @property + def num_samples(self): + return self._num_samples + + def __post_init__(self): + self._num_samples = len(self.count_files) + self._sample_names = [f.stem for f in self.count_files] + + if len(self.count_files) != len(self.strand_files): + _log_and_raise_error( + ( + f"Unequal number of count files and strand files for study '{self.study_name}'. " + f"Found {len(self.count_files)} count files and {len(self.strand_files)} strand files." + ), + error=ValueError, + level=LogLevel.ERROR, + ) + + if self.num_samples != len(self.count_files): + _log_and_raise_error( + ( + f"Unequal number of samples and count files for study '{self.study_name}'. " + f"Found {self.num_samples} samples and {len(self.count_files)} count files." + ), + error=ValueError, + level=LogLevel.ERROR, + ) + + if self.num_samples != len(self.strand_files): + _log_and_raise_error( + ( + f"Unequal number of samples and strand files for study '{self.study_name}'. " + f"Found {self.num_samples} samples and {len(self.strand_files)} strand files." + ), + error=ValueError, + level=LogLevel.ERROR, + ) + + if self._num_samples == 1: + _log_and_raise_error( + f"Only one sample exists for study {self.study_name}. 
Provide at least two samples", + error=ValueError, + level=LogLevel.ERROR, + ) + + self.count_files.sort() + self.strand_files.sort() + self._sample_names.sort() + + +@dataclass(slots=True) +class SampleConfiguration: + sample_name: str + fragment_length: float + layout: str + strand: str + study: str + library_prep: str + + +async def _read_text(path: Path | None, *, default: str, lower: bool = False) -> str: + if path is None: + return default + async with aiofiles.open(path) as f: + txt = (await f.read()).strip() + return txt.lower() if lower else txt + + +def _sample_name_from_filepath(file: Path) -> str: + result = re.search(r".+_S\d+R\d+(r\d+)?", file.stem) + if result: + return result.group() + raise ValueError(f"Could not extract sample name from filepath: {file}") + + +def _require_one(paths: list[Path | None], kind: Literal["layout", "strand", "preparation", "fragment"], label: str) -> Path | None: + if len(paths) == 1: + return paths[0] + if len(paths) == 0: + return None + _log_and_raise_error( + f"Multiple matching {kind} files for {label}, make sure there is only one copy for each replicate in COMO_input", + error=ValueError, + level=LogLevel.ERROR, + ) + + +# def _organize_gene_counts_files(data_dir: Path) -> list[_StudyMetrics]: +def _organize_gene_counts_files(data_dir: Path) -> Generator[_StudyMetrics, None, None]: + gene_count_dir = Path(data_dir, "geneCounts") + strand_dir = Path(data_dir, "strandedness") + + gene_counts_directories: list[Path] = sorted([p for p in gene_count_dir.glob("*") if not p.name.startswith(".")]) + strandedness_directories: list[Path] = sorted([p for p in strand_dir.glob("*") if not p.name.startswith(".")]) + + if len(gene_counts_directories) != len(strandedness_directories): + _log_and_raise_error( + ( + f"Unequal number of gene count directories and strandedness directories. " + f"Found {len(gene_counts_directories)} gene count directories and {len(strandedness_directories)} strandedness directories." 
+ f"\nGene count directory: {gene_count_dir}\nStrandedness directory: {strand_dir}" + ), + error=ValueError, + level=LogLevel.ERROR, + ) + + # For each study, collect gene count files, fragment files, insert size files, layouts, and strandedness information + for gene_dir, strand_dir in zip(gene_counts_directories, strandedness_directories, strict=True): + count_files = list(gene_dir.glob("*.tab")) + strand_files = list(strand_dir.glob("*.txt")) + if len(count_files) == 0: + _log_and_raise_error(f"No count files found for study '{gene_dir.stem}'.", error=ValueError, level=LogLevel.ERROR) + if len(strand_files) == 0: + _log_and_raise_error( + f"No strandedness files found for study '{gene_dir.stem}'.", + error=ValueError, + level=LogLevel.ERROR, + ) + + yield _StudyMetrics( + study_name=gene_dir.stem, + count_files=count_files, + strand_files=strand_files, + ) + + +async def _process_first_multirun_sample(strand_file: Path, all_counts_files: list[Path]) -> pd.Series: + all_star_information: tuple[_STARinformation] = await asyncio.gather(*[_STARinformation.build_from_tab(file) for file in all_counts_files]) + + async with aiofiles.open(strand_file) as f: + strand_information: str = await f.read() + strand_information = strand_information.removesuffix("\n").lower() + if strand_information not in ("none", "first_read_transcription_strand", "second_read_transcription_strand"): + _log_and_raise_error( + ( + f"Unrecognized Strand Information: {strand_information}; " + f"expected 'none', 'first_read_transcription_strand', or 'second_read_transcription_strand'" + ), + error=ValueError, + level=LogLevel.ERROR, + ) + if strand_information == "none": + strand_information = "unstranded_rna_counts" + + df_objs: list[pd.DataFrame] = [] + for star_information in all_star_information: + run_counts = star_information.count_matrix[["ensembl_gene_id", strand_information]] + run_counts.columns = ["ensembl_gene_id", "counts"] + df_objs.append(run_counts) + # sample_count = run_counts if sample_count.empty else sample_count.merge(run_counts, on=["ensembl_gene_id", "counts"], how="outer") + sample_count = reduce(lambda x, y: pd.merge(x, y, on=["ensembl_gene_id", "counts"], how="outer"), df_objs) + + # Set na values to 0 + sample_count = sample_count.fillna(value=np.float32(0)) + sample_count["counts"] = sample_count["counts"].astype(np.float64) + + count_sums = sample_count.groupby("ensembl_gene_id", as_index=False)["counts"].mean() + count_sums["counts"] = np.ceil(count_sums["counts"].astype(np.uint32)) + count_sums.columns = ["ensembl_gene_id", _sample_name_from_filepath(strand_file)] + return count_sums + + +async def _process_standard_replicate(counts_file: Path, strand_file: Path, sample_name: str): + star_information = await _STARinformation.build_from_tab(counts_file) + strand_information = strand_file.read_text().rstrip("\n").lower() + + if strand_information not in ("none", "first_read_transcription_strand", "second_read_transcription_strand"): + _log_and_raise_error( + ( + f"Unrecognized Strand Information: {strand_information}; " + f"expected 'none', 'first_read_transcription_strand', or 'second_read_transcription_strand'" + ), + error=ValueError, + level=LogLevel.ERROR, + ) + + if strand_information == "none": + strand_information = "unstranded_rna_counts" + + sample_count = star_information.count_matrix[["ensembl_gene_id", strand_information]] + sample_count.columns = ["ensembl_gene_id", sample_name] + return sample_count + + +async def _prepare_sample_counts( + sample_name: str, + counts_file: Path, 
+ strand_file: Path, + all_counts_files: list[Path], +) -> pd.Series | pd.DataFrame | None: + # Test if the counts_file is the first run in a multi-run smaple + if re.search(r"R\d+r1", counts_file.as_posix()): + return await _process_first_multirun_sample(strand_file=strand_file, all_counts_files=all_counts_files) + elif re.search(r"R\d+r\d+", counts_file.as_posix()): + return None + else: + return await _process_standard_replicate(counts_file, strand_file, sample_name) + + +async def _create_sample_counts_matrix(metrics: _StudyMetrics) -> pd.DataFrame: + adjusted_index = 0 + counts = await _prepare_sample_counts( + sample_name=metrics.sample_names[0], + counts_file=metrics.count_files[0], + strand_file=metrics.strand_files[0], + all_counts_files=metrics.count_files, + ) + + for i in range(1, metrics.num_samples): + new_counts = await _prepare_sample_counts( + sample_name=metrics.sample_names[i], + counts_file=metrics.count_files[i], + strand_file=metrics.strand_files[i], + all_counts_files=metrics.count_files, + ) + if new_counts is None: + adjusted_index += 1 + continue + + counts: pd.DataFrame = counts.merge(new_counts, on="ensembl_gene_id", how="outer") + counts = counts.fillna(value=0) + + # Remove run number "r\d+" from multi-run names + if re.search(r"R\d+r1", metrics.sample_names[i]): + new_sample_name = re.sub(r"r\d+", "", metrics.sample_names[i]) + old_col_name = counts.columns[i + 1 - adjusted_index] + counts.rename(columns={old_col_name: new_sample_name}, inplace=True) + + return counts + + +async def _write_counts_matrix( + *, + config_df: pd.DataFrame, + como_context_dir: Path, + output_counts_matrix_filepath: Path, + rna: RNAType, +) -> pd.DataFrame: + """Create a counts matrix file by reading gene counts table(s). + + Args: + config_df: Configuration DataFrame containing sample information. + como_context_dir: Path to the COMO_input directory containing gene count files. + output_counts_matrix_filepath: Path where the output counts matrix CSV will be saved. + rna: RNAType enum indicating whether to process 'trna' or 'mrna' samples. + + Returns: + A pandas DataFrame representing the final counts matrix. + """ + counts: list[pd.DataFrame] = await asyncio.gather( + *[_create_sample_counts_matrix(metric) for metric in _organize_gene_counts_files(data_dir=como_context_dir)] + ) + rna_specific_sample_names = set(config_df.loc[config_df["library_prep"] == rna.value, "sample_name"].tolist()) + + final_matrix: pd.DataFrame = functools.reduce(lambda left, right: pd.merge(left, right, on="ensembl_gene_id", how="outer"), counts) + final_matrix.fillna(value=0, inplace=True) + final_matrix.iloc[:, 1:] = final_matrix.iloc[:, 1:].astype(np.uint64) + final_matrix = final_matrix[["ensembl_gene_id", *[col for col in final_matrix.columns if col in rna_specific_sample_names]]] + + output_counts_matrix_filepath.parent.mkdir(parents=True, exist_ok=True) + final_matrix.dropna(inplace=True) + final_matrix.to_csv(output_counts_matrix_filepath, index=False) + logger.success(f"Wrote gene count matrix for '{rna.value}' RNA at '{output_counts_matrix_filepath}'") + return final_matrix + + +async def _create_config_df( # noqa: C901 + context_name: str, + /, + como_context_dir: Path, + gene_count_dirname: str = "geneCounts", + layout_dirname: str = "layouts", + strandedness_dirname: str = "strandedness", + fragment_sizes_dirname: str = "fragmentSizes", + prep_method_dirname: str = "prepMethods", +) -> pd.DataFrame: + """Create configuration sheet. 
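`_create_sample_counts_matrix` and `_write_counts_matrix` above assemble the final matrix by repeatedly outer-merging per-sample frames on `ensembl_gene_id` and filling gaps with zero. The core of that pattern, with fabricated sample names and counts:

```python
import functools
import pandas as pd

samples = [
    pd.DataFrame({"ensembl_gene_id": ["ENSG01", "ENSG02"], "liver_S1R1": [10, 5]}),
    pd.DataFrame({"ensembl_gene_id": ["ENSG01", "ENSG03"], "liver_S1R2": [8, 2]}),
]

matrix = functools.reduce(
    lambda left, right: pd.merge(left, right, on="ensembl_gene_id", how="outer"),
    samples,
).fillna(0)
print(matrix)
#   ensembl_gene_id  liver_S1R1  liver_S1R2
# 0          ENSG01        10.0         8.0
# 1          ENSG02         5.0         0.0
# 2          ENSG03         0.0         2.0
```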
+
+    The configuration file created is based on the gene counts matrix.
+    If using the zFPKM normalization technique, mean fragment lengths will be fetched.
+
+    Args:
+        context_name: Name of the context, used as a prefix for sample names.
+        como_context_dir: Path to the COMO_input directory containing subdirectories for
+            gene counts, layouts, strandedness, fragment sizes, and prep methods.
+        gene_count_dirname: Name of the subdirectory containing gene count files.
+        layout_dirname: Name of the subdirectory containing layout files.
+        strandedness_dirname: Name of the subdirectory containing strandedness files.
+        fragment_sizes_dirname: Name of the subdirectory containing fragment size files.
+        prep_method_dirname: Name of the subdirectory containing library preparation method files.
+
+    Returns:
+        A pandas DataFrame representing the configuration sheet.
+    """
+    label_regex: Final = re.compile(r"(?P<study>S\d{1,3})(?P<rep>R\d{1,3})(?P<run>r\d{1,3})?")
+    gene_counts: list[Path] = list((como_context_dir / gene_count_dirname).rglob("*.tab"))
+    if not gene_counts:
+        _log_and_raise_error(
+            f"No gene count files found in '{como_context_dir / gene_count_dirname}'",
+            error=FileNotFoundError,
+            level=LogLevel.ERROR,
+        )
+
+    auxillary_directories = {
+        "layout": como_context_dir / layout_dirname,
+        "strand": como_context_dir / strandedness_dirname,
+        "fragment": como_context_dir / fragment_sizes_dirname,
+        "prep": como_context_dir / prep_method_dirname,
+    }
+    aux_lookup: dict[str, dict[str, Path]] = {kind: {} for kind in auxillary_directories}
+    for kind, root in auxillary_directories.items():
+        kind: str
+        root: Path
+        for p in root.rglob("*"):
+            if p.is_file():
+                m = label_regex.search(p.stem)
+                if m:
+                    aux_lookup[kind][m.group(0)] = p
+
+    rows: list[SampleConfiguration] = []
+    for gene_count_path in sorted(gene_counts):
+        m = label_regex.search(gene_count_path.as_posix())
+        if not m:
+            _log_and_raise_error(
+                f"Filename '{gene_count_path.name}' does not match contextName_SXRYrZ.tab pattern",
+                error=ValueError,
+                level=LogLevel.ERROR,
+            )
+        label = m.group(0)
+        study_number = m["study"]
+        rep_number = m["rep"]
+        sample_id = f"{context_name}_{study_number}{rep_number}"
+
+        layout_path = _require_one([aux_lookup["layout"].get(label)], "layout", label)
+        strand_path = _require_one([aux_lookup["strand"].get(label)], "strand", label)
+        prep_path = _require_one([aux_lookup["prep"].get(label)], "preparation", label)
+
+        layout, strand, prep = await asyncio.gather(
+            *[
+                _read_text(layout_path, default="UNKNOWN"),
+                _read_text(strand_path, default="UNKNOWN"),
+                _read_text(prep_path, default="total", lower=True),
+            ],
+        )
+        if prep not in {"total", "mrna"}:
+            _log_and_raise_error(
+                f"Prep method must be 'total' or 'mrna' (got '{prep}') for {label}",
+                error=ValueError,
+                level=LogLevel.ERROR,
+            )
+
+        fragment_label = f"{context_name}_{label}_fragment_size.txt"
+        frag_paths = [p for p in aux_lookup["fragment"].values() if p.name == fragment_label]
+        mean_frag = 100.0
+        if not frag_paths and prep != RNAType.TRNA.value:
+            logger.warning(f"No fragment file for '{label}'; defaulting to 100 bp (needed for zFPKM).")
+            mean_frag = 100.0
+        elif len(frag_paths) == 1 and layout == "single-end":
+            mean_frag = 0.0
+        else:  # 1-N files, paired end
+            dfs: list[pd.DataFrame] = await asyncio.gather(*[read_file(f, h5ad_as_df=True, sep="\t", on_bad_lines="skip") for f in frag_paths])
+            for df in dfs:
+                df["meanxcount"] = df["frag_mean"] * df["frag_count"]
+            counts = np.array([df["frag_count"].sum() for df in dfs])
+            means =
np.array([(df["meanxcount"] / df["frag_count"].sum()).sum() for df in dfs]) + mean_frag = float(np.average(means, weights=counts)) + + rows.append( + SampleConfiguration( + sample_name=sample_id, + fragment_length=mean_frag, + layout=layout, + strand=strand, + study=study_number, + library_prep=prep, + ) + ) + + df = pd.DataFrame.from_records([asdict(r) for r in rows]).sort_values("sample_name", ignore_index=True) + return df + + # 6-3-25: Intentionally left commented-out code to test its replacement + # gene_counts_dir = como_context_dir / gene_count_dirname + # layout_dir = como_context_dir / layout_dirname + # strandedness_dir = como_context_dir / strandedness_dirname + # fragment_sizes_dir = como_context_dir / fragment_sizes_dirname + # prep_method_dir = como_context_dir / prep_method_dirname + # + # gene_counts_files = list(gene_counts_dir.rglob("*.tab")) + # sample_names: list[str] = [] + # fragment_lengths: list[int | float] = [] + # layouts: list[str] = [] + # strands: list[str] = [] + # groups: list[str] = [] + # preparation_method: list[str] = [] + # + # if len(gene_counts_files) == 0: + # _log_and_raise_error(f"No gene count files found in '{gene_counts_dir}'.", error=FileNotFoundError, level=LogLevel.ERROR) + # + # for gene_count_filename in sorted(gene_counts_files): + # # Match S___R___r___ + # # \d{1,3} matches 1-3 digits + # # (?:r\d{1,3})? optionally matches a "r" followed by three digits + # label = re.findall(r"S\d{1,3}R\d{1,3}(?:r\d{1,3})?", gene_count_filename.as_posix())[0] + # if not label: + # _log_and_raise_error( + # ( + # f"\n\nFilename of '{gene_count_filename}' is not valid. " + # f"Should be 'contextName_SXRYrZ.tab', " + # f"where X is the study/batch number, Y is the replicate number, " + # f"and Z is the run number." + # "\n\nIf not a multi-run sample, exclude 'rZ' from the filename." 
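When a paired-end replicate has several fragment-size files (one per run), the code above computes each run's mean fragment length from its `frag_mean`/`frag_count` columns and then averages those means weighted by the per-run fragment counts. A worked example with invented numbers:

```python
import numpy as np
import pandas as pd

runs = [
    pd.DataFrame({"frag_mean": [200.0, 300.0], "frag_count": [1_000, 3_000]}),   # run 1
    pd.DataFrame({"frag_mean": [250.0], "frag_count": [2_000]}),                  # run 2
]

per_run_counts = np.array([df["frag_count"].sum() for df in runs])               # [4000, 2000]
per_run_means = np.array(
    [(df["frag_mean"] * df["frag_count"]).sum() / df["frag_count"].sum() for df in runs]
)                                                                                 # [275.0, 250.0]

mean_fragment_length = float(np.average(per_run_means, weights=per_run_counts))
print(mean_fragment_length)   # ~266.67
```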
+ # ), + # error=ValueError, + # level=LogLevel.ERROR, + # ) + # + # study_number = re.findall(r"S\d{1,3}", label)[0] + # rep_number = re.findall(r"R\d{1,3}", label)[0] + # run_number = re.findall(r"r\d{1,3}", label) + # + # multi_flag = 0 + # if len(run_number) > 0: + # if run_number[0] != "r1": + # continue + # label_glob = f"{study_number}{rep_number}r*" # S__R__r* + # runs = [run for run in gene_counts_files if re.search(label_glob, run.as_posix())] + # multi_flag = 1 + # frag_files = [] + # + # for run in runs: + # run_number = re.findall(r"R\d{1,3}", run.as_posix())[0] + # replicate = re.findall(r"r\d{1,3}", run.as_posix())[0] + # frag_filename = "".join([context_name, "_", study_number, run_number, replicate, "_fragment_size.txt"]) + # frag_files.append(como_context_dir / fragment_sizes_dirname / study_number / frag_filename) + # + # layout_files: list[Path] = list(layout_dir.rglob(f"{context_name}_{label}_layout.txt")) + # strand_files: list[Path] = list(strandedness_dir.rglob(f"{context_name}_{label}_strandedness.txt")) + # frag_files: list[Path] = list(fragment_sizes_dir.rglob(f"{context_name}_{label}_fragment_size.txt")) + # prep_files: list[Path] = list(prep_method_dir.rglob(f"{context_name}_{label}_prep_method.txt")) + # + # layout = "UNKNOWN" + # if len(layout_files) == 0: + # logger.warning( + # f"No layout file found for {label}, writing as 'UNKNOWN', " + # f"this should be defined if you are using zFPKM or downstream 'rnaseq_gen.py' will not run" + # ) + # elif len(layout_files) == 1: + # with layout_files[0].open("r") as file: + # layout = file.read().strip() + # elif len(layout_files) > 1: + # _log_and_raise_error( + # f"Multiple matching layout files for {label}, make sure there is only one copy for each replicate in COMO_input", + # error=ValueError, + # level=LogLevel.ERROR, + # ) + # + # strand = "UNKNOWN" + # if len(strand_files) == 0: + # logger.warning( + # f"No strandedness file found for {label}, writing as 'UNKNOWN'. " + # f"This will not interfere with the analysis since you have already set rnaseq_preprocess.py to " + # f"infer the strandedness when writing the counts matrix" + # ) + # elif len(strand_files) == 1: + # with strand_files[0].open("r") as file: + # strand = file.read().strip() + # elif len(strand_files) > 1: + # _log_and_raise_error( + # f"Multiple matching strandedness files for {label}, make sure there is only one copy for each replicate in COMO_input", + # error=ValueError, + # level=LogLevel.ERROR, + # ) + # + # prep = "total" + # if len(prep_files) == 0: + # logger.warning(f"No prep file found for {label}, assuming 'total', as in 'Total RNA' library preparation") + # elif len(prep_files) == 1: + # with prep_files[0].open("r") as file: + # prep = file.read().strip().lower() + # if prep not in ["total", "mrna"]: + # _log_and_raise_error( + # f"Prep method must be either 'total' or 'mrna' for {label}", + # error=ValueError, + # level=LogLevel.ERROR, + # ) + # elif len(prep_files) > 1: + # _log_and_raise_error( + # f"Multiple matching prep files for {label}, make sure there is only one copy for each replicate in COMO_input", + # error=ValueError, + # level=LogLevel.ERROR, + # ) + # + # mean_fragment_size = 100 + # if len(frag_files) == 0 and prep != RNAType.TRNA.value: + # logger.warning( + # f"No fragment file found for {label}, using '100'. 
You should define this if you are going to use downstream zFPKM normalization" + # ) + # elif len(frag_files) == 1: + # if layout == "single-end": + # mean_fragment_size = 0 + # else: + # if not multi_flag: + # frag_df = pd.read_table(frag_files[0], low_memory=False) + # frag_df["meanxcount"] = frag_df["frag_mean"] * frag_df["frag_count"] + # mean_fragment_size = sum(frag_df["meanxcount"] / sum(frag_df["frag_count"])) + # + # else: + # mean_fragment_sizes = np.array([]) + # library_sizes = np.array([]) + # for ff in frag_files: + # frag_df = pd.read_table(ff, low_memory=False, sep="\t", on_bad_lines="skip") + # frag_df["meanxcount"] = frag_df["frag_mean"] * frag_df["frag_count"] + # mean_fragment_size = sum(frag_df["meanxcount"] / sum(frag_df["frag_count"])) + # mean_fragment_sizes = np.append(mean_fragment_sizes, mean_fragment_size) + # library_sizes = np.append(library_sizes, sum(frag_df["frag_count"])) + # + # mean_fragment_size = sum(mean_fragment_sizes * library_sizes) / sum(library_sizes) + # elif len(frag_files) > 1: + # _log_and_raise_error( + # f"Multiple matching fragment files for {label}, make sure there is only one copy for each replicate in COMO_input", + # error=ValueError, + # level=LogLevel.ERROR, + # ) + # + # sample_names.append(f"{context_name}_{study_number}{rep_number}") + # fragment_lengths.append(mean_fragment_size) + # layouts.append(layout) + # strands.append(strand) + # groups.append(study_number) + # preparation_method.append(prep) + # + # out_df = pd.DataFrame( + # { + # "sample_name": sample_names, + # "fragment_length": fragment_lengths, + # "layout": layouts, + # "strand": strands, + # "study": groups, + # "library_prep": preparation_method, + # } + # ).sort_values("sample_name") + # return out_df + + +async def _create_gene_info_file( + *, + counts_matrix_filepaths: list[Path], + output_filepath: Path, + taxon: int, + cache: bool, +): + """Create a gene information file context. 
+ + The gene information file will be created by reading each matrix filepath in the provided list + """ + + async def read_counts(file: Path) -> list[str]: + data = await read_file(file, h5ad_as_df=False) + + try: + if isinstance(data, pd.DataFrame): + conversion = await ensembl_to_gene_id_and_symbol(ids=data["ensembl_gene_id"].tolist(), taxon=taxon) + elif isinstance(data, sc.AnnData): + conversion = await gene_symbol_to_ensembl_and_gene_id(symbols=data.var_names.tolist(), taxon=taxon) + else: + raise TypeError(f"Unsupported data type '{type(data)}' for file '{file}'") + except json.JSONDecodeError: + _log_and_raise_error( + f"Got a JSON decode error for file '{counts_matrix_filepaths}'", + error=ValueError, + level=LogLevel.CRITICAL, + ) + + # Remove NA values from entrez_gene_id dataframe column + return conversion["entrez_gene_id"].dropna().tolist() + + logger.info("Fetching gene info - this can take up to 5 minutes depending on the number of genes and your internet connection") + genes = set(chain.from_iterable(await asyncio.gather(*[read_counts(f) for f in counts_matrix_filepaths]))) + gene_data = await MyGene(cache=cache).query(items=list(genes), taxon=taxon, scopes="entrezgene") + gene_info: pd.DataFrame = pd.DataFrame( + data=None, + columns=["ensembl_gene_id", "gene_symbol", "entrez_gene_id", "size"], + index=list(range(len(gene_data))), + ) + for i, data in enumerate(gene_data): + # ensembl_ids = data.get("genomic_pos.ensemblgene", "-") + ensembl_ids = data.get("genomic_pos.ensemblgene", pd.NA) + if isinstance(ensembl_ids, list): + ensembl_ids = ensembl_ids[0] + + start_pos = data.get("genomic_pos.start", 0) + start_pos: int = int(sum(start_pos) / len(start_pos)) if isinstance(start_pos, list) else int(start_pos) + end_pos = data.get("genomic_pos.end", 0) + end_pos: int = int(sum(end_pos) / len(end_pos)) if isinstance(end_pos, list) else int(end_pos) + + gene_info.at[i, "gene_symbol"] = data.get("symbol", pd.NA) + gene_info.at[i, "entrez_gene_id"] = data.get("entrezgene", pd.NA) + gene_info.at[i, "ensembl_gene_id"] = ensembl_ids + gene_info.at[i, "size"] = end_pos - start_pos + + gene_info = gene_info[((~gene_info["entrez_gene_id"].isna()) & (~gene_info["ensembl_gene_id"].isna()) & (~gene_info["gene_symbol"].isna()))] + gene_info.sort_values(by="ensembl_gene_id", inplace=True) + gene_info.dropna(inplace=True) + + output_filepath.parent.mkdir(parents=True, exist_ok=True) + gene_info.to_csv(output_filepath, index=False) + logger.success(f"Gene Info file written at '{output_filepath}'") + + +async def _process_como_input( + context_name: str, + output_config_filepath: Path, + como_context_dir: PATH_TYPE, + output_counts_matrix_filepath: Path, + rna: RNAType, +) -> None: + config_df = await _create_config_df( + context_name, + como_context_dir=Path(como_context_dir), + ) + + await _write_counts_matrix( + config_df=config_df, + como_context_dir=Path(como_context_dir), + output_counts_matrix_filepath=output_counts_matrix_filepath, + rna=rna, + ) + with pd.ExcelWriter(output_config_filepath) as writer: + subset_config = config_df[config_df["library_prep"] == rna.value] + subset_config.to_excel(writer, sheet_name=context_name, header=True, index=False) + + +async def _process( + context_name: str, + taxon: int, + output_gene_info_filepath: Path, + como_context_dir: Path | None, + input_matrix_filepath: list[Path] | None, + output_trna_config_filepath: Path | None, + output_mrna_config_filepath: Path | None, + output_trna_matrix_filepath: Path | None, + output_mrna_matrix_filepath: Path 
| None, + *, + cache: bool, + create_gene_info_only: bool, +): + rna_types: list[tuple[RNAType, Path, Path | None]] = [] + if output_trna_config_filepath: + rna_types.append((RNAType.TRNA, output_trna_config_filepath, output_trna_matrix_filepath)) + if output_mrna_config_filepath: + rna_types.append((RNAType.MRNA, output_mrna_config_filepath, output_mrna_matrix_filepath)) + + # if provided, iterate through como-input specific directories + if not create_gene_info_only: + if not como_context_dir: + raise ValueError("`como_context_directory` must not be None if not in `create_gene_info_only` mode") + tasks = [] + for rna, output_config_filepath, output_matrix_filepath in rna_types: + if output_matrix_filepath is None: + logger.warning(f"Not creating RNA type '{rna.value}' because the output matrix filepath was None.") + continue + tasks.append( + asyncio.create_task( + _process_como_input( + context_name=context_name, + output_config_filepath=output_config_filepath, + como_context_dir=Path(como_context_dir), + output_counts_matrix_filepath=output_matrix_filepath, + rna=rna, + ) + ) + ) + + await asyncio.gather(*tasks) + + # create the gene info filepath based on provided data + input_files = [] + if input_matrix_filepath: + input_files.extend(input_matrix_filepath) + if output_trna_matrix_filepath: + input_files.append(output_trna_matrix_filepath) + if output_mrna_matrix_filepath: + input_files.append(output_mrna_matrix_filepath) + + await _create_gene_info_file( + counts_matrix_filepaths=input_files, + output_filepath=output_gene_info_filepath, + taxon=taxon, + cache=cache, + ) + + +async def rnaseq_preprocess( + context_name: str, + taxon: int, + output_gene_info_filepath: Path, + como_context_dir: Path | None = None, + input_matrix_filepath: Path | list[Path] | None = None, + output_trna_metadata_filepath: Path | None = None, + output_mrna_metadata_filepath: Path | None = None, + output_trna_count_matrix_filepath: Path | None = None, + output_mrna_count_matrix_filepath: Path | None = None, + cache: bool = True, + log_level: LogLevel | str = LogLevel.INFO, + log_location: str | TextIO = sys.stderr, + *, + create_gene_info_only: bool = False, +) -> None: + """Preprocesses RNA-seq data for downstream analysis. + + Fetches additional gene information from a provided matrix or gene counts, + or optionally creates this matrix using gene count files obtained using STAR aligner + + :param context_name: The context/cell type being processed + :param taxon: The NCBI taxonomy ID + :param output_gene_info_filepath: Path to the output gene information CSV file + :param output_trna_metadata_filepath: Path to the output tRNA config file (if in "create" mode) + :param output_mrna_metadata_filepath: Path to the output mRNA config file (if in "create" mode) + :param output_trna_count_matrix_filepath: The path to write total RNA count matrices + :param output_mrna_count_matrix_filepath: The path to write messenger RNA count matrices + :param como_context_dir: If in "create" mode, the input path(s) to the COMO_input directory of the current context + i.e., the directory containing "fragmentSizes", "geneCounts", "insertSizeMetrics", etc. 
directories + :param input_matrix_filepath: If in "provide" mode, the path(s) to the count matrices to be processed~ + :param cache: Should HTTP requests be cached + :param log_level: The logging level + :param log_location: The logging location + :param create_gene_info_only: If True, only create the gene info file and skip general preprocessing steps + """ + set_up_logging(level=log_level, location=log_location) + + output_gene_info_filepath = output_gene_info_filepath.resolve() + + if como_context_dir: + como_context_dir = como_context_dir.resolve() + + input_matrix_filepath = [i.resolve() for i in listify(input_matrix_filepath)] if input_matrix_filepath else None + output_trna_metadata_filepath = output_trna_metadata_filepath.resolve() if output_trna_metadata_filepath else None + output_mrna_metadata_filepath = output_mrna_metadata_filepath.resolve() if output_mrna_metadata_filepath else None + output_trna_count_matrix_filepath = output_trna_count_matrix_filepath.resolve() if output_trna_count_matrix_filepath else None + output_mrna_count_matrix_filepath = output_mrna_count_matrix_filepath.resolve() if output_mrna_count_matrix_filepath else None + + await _process( + context_name=context_name, + taxon=taxon, + como_context_dir=como_context_dir, + input_matrix_filepath=input_matrix_filepath, + output_gene_info_filepath=output_gene_info_filepath, + output_trna_config_filepath=output_trna_metadata_filepath, + output_mrna_config_filepath=output_mrna_metadata_filepath, + output_trna_matrix_filepath=output_trna_count_matrix_filepath, + output_mrna_matrix_filepath=output_mrna_count_matrix_filepath, + cache=cache, + create_gene_info_only=create_gene_info_only, + ) + + +async def _main(): + context_name = "notreatment" + taxon = 9606 + como_context_dir = Path("/Users/joshl/Projects/COMO/main/data/COMO_input/notreatment") + output_gene_info_filepath = Path("/Users/joshl/Projects/COMO/main/data/results/notreatment/gene_info.csv") + output_trna_metadata_filepath = Path("/Users/joshl/Projects/COMO/main/data/config_sheets/trna_config.xlsx") + output_trna_count_matrix_filepath = Path("/Users/joshl/Projects/COMO/main/data/results/notreatment/total-rna/totalrna_notreatment.csv") + + await rnaseq_preprocess( + context_name=context_name, + taxon=taxon, + como_context_dir=como_context_dir, + input_matrix_filepath=None, + output_gene_info_filepath=output_gene_info_filepath, + output_trna_metadata_filepath=output_trna_metadata_filepath, + output_trna_count_matrix_filepath=output_trna_count_matrix_filepath, + cache=False, + log_level="INFO", + ) + + +if __name__ == "__main__": + asyncio.run(_main()) diff --git a/main/como/rpy2_api.py b/main/como/rpy2_api.py new file mode 100644 index 00000000..910552ec --- /dev/null +++ b/main/como/rpy2_api.py @@ -0,0 +1,36 @@ +# ruff: noqa + +import typing +from pathlib import Path + +import rpy2 +import rpy2.robjects.packages + + +class Rpy2: + def __init__(self, r_file_path: Path, *args, **kwargs) -> None: + """ + This class is responsible for providing access to rpy2 + """ + self._r_file_path: Path = r_file_path + self._r_file_read: str = open(self._r_file_path, "r").read() + self._args = args + self._kwargs = kwargs + + def call_function(self, r_function_name: str, *args, **kwargs) -> typing.Any: + """ + Call an R function + Taken in part from: https://gist.github.com/indraniel/da11c4f79c79b5e6bfb8 + """ + if self._args == (): # args is empty + self._args = args + if self._kwargs == {}: + self._kwargs = kwargs + + func_ = 
rpy2.robjects.packages.SignatureTranslatedAnonymousPackage(self._r_file_read, "func_") + + # Dynamically call the func_ function, using the arguments passed in + # From: https://stackoverflow.com/questions/11781265/ + call_func_ = getattr(func_, r_function_name) + results = call_func_(*self._args, **self._kwargs) + return results diff --git a/main/como/rscripts/DGE.R b/main/como/rscripts/DGE.R new file mode 100644 index 00000000..341a2553 --- /dev/null +++ b/main/como/rscripts/DGE.R @@ -0,0 +1,139 @@ +suppressPackageStartupMessages(library("DESeq2")) +suppressPackageStartupMessages(library("edgeR")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("readxl")) +suppressPackageStartupMessages(library("stringr")) + +work_dir <- getwd() +r_log_directory <- stringr::str_interp("${work_dir}/logs") +if (!dir.exists(r_log_directory)) { dir.create(r_log_directory) } +zz <- file(file.path(r_log_directory, "DGE.Rout"), open = "wt") +sink(zz, type = "message") + +readCountMatrix <- function(cmat_file, config_file, disease_name) { + conf <- readxl::read_excel(config_file, sheet = disease_name, col_names = TRUE) + cmat_whole <- readr::read_csv(cmat_file) + cmat_whole[, -1] <- lapply(cmat_whole[, -1], as.numeric) + cmat_whole <- cmat_whole[rowSums(cmat_whole[, -1]) > 0,] + genes <- as.character(cmat_whole$genes) + samps <- as.character(conf$Sample) + exps <- as.character(conf$Experiment) + if (length(genes) == 0 | is.null(genes)) { + print("disease count matrix must have a column headed 'genes'") + stop() + } + SampMetrics <- list() + for (i in 1:length(samps)) { + entry <- samps[i] + if (entry %in% colnames(cmat_whole)) { + counts <- cmat_whole[entry] + group <- exps[i] + SampMetrics[[group]][[entry]][["Counts"]] <- counts + SampMetrics[[group]][[entry]][["Ensembl"]] <- genes + } else if (paste(c("X", entry), collapse = "") %in% colnames(cmat_whole)) { + entry <- paste(c("X", entry), collapse = "") + counts <- cmat_whole[entry] + group <- exps[i] + SampMetrics[[group]][[entry]][["Counts"]] <- counts + SampMetrics[[group]][[entry]][["Ensembl"]] <- genes + } else { + print(paste0(entry, " not found in disease count matrix")) + } + } + return(SampMetrics) +} + + +dgeAnalysis <- function(SampMetrics, test_name, tissue_name, disease_name) { + gene_list <- SampMetrics[[1]][[1]][["Ensembl"]] + + df <- data.frame(Ensembl = gene_list) + group_list <- c(rep("control", length(SampMetrics[["control"]])), rep('patient', length(SampMetrics[["patient"]]))) + + for (j in 1:length(SampMetrics[["control"]])) { + df <- cbind(df, SampMetrics[["control"]][[j]][["Counts"]]) + } + for (j in 1:length(SampMetrics[["patient"]])) { + df <- cbind(df, SampMetrics[["patient"]][[j]][["Counts"]]) + } + + df[is.na(df)] <- 0 + ensembl <- df["Ensembl"] + df["Ensembl"] <- NULL + df <- data.frame(sapply(df, as.numeric)) + dgList <- DGEList(counts = df, genes = gene_list, group = group_list) + dgList$samples$group <- relevel(dgList$samples$group, ref = "control") + dgList <- calcNormFactors(dgList, method = "TMM") + + tmm <- cpm(dgList) + + dir.create(file.path(work_dir, "data", "results", tissue_name, disease_name), + showWarnings = FALSE + ) + write.csv(cbind(ensembl, tmm), + file.path(work_dir, "data", "results", tissue_name, disease_name, "TMM_Matrix.csv") + ) + + # MDS Plot + plotname <- file.path(work_dir, "data", "results", tissue_name, disease_name, "MDS_plot.jpg") + title <- "DGEList Multi-Dimensional Scaling" + jpeg(plotname) + lab <- colnames(df) + plotMDS(dgList, labels = lab, main = 
title) + dev.off() + + # create design matrix + designMat <- model.matrix(~0 + group, data = dgList$samples) + colnames(designMat) <- levels(dgList$samples$group) + + # BCV plot + plotname <- file.path(work_dir, "data", "results", tissue_name, disease_name, "BCV_plot.jpg") + title <- "DGEList Biological Coefficient of Variation" + dgList <- estimateGLMCommonDisp(dgList, design = designMat) + dgList <- estimateGLMTrendedDisp(dgList, design = designMat) + dgList <- estimateGLMTagwiseDisp(dgList, design = designMat) + jpeg(plotname) + plotBCV(dgList, main = title) + dev.off() + + # GLM approach + fit <- glmQLFit(dgList, designMat) + gp <- as.character(unique(dgList$samples$group)) + glen <- length(gp) + for (i in 2:glen) { + test_cell <- gp[i] + con <- rep(0, glen) + con[1] <- -1 + con[i] <- 1 + qlf <- glmQLFTest(fit, contrast = con) + edgeR_result <- topTags(qlf, n = 65000) + deGenes <- decideTestsDGE(qlf, adjust.method = "BH", p.value = 0.05) + deGenes <- rownames(qlf)[as.logical(deGenes)] + expTab <- edgeR_result$table + + # save results + names(expTab)[names(expTab) == "PValue"] <- "P.Value" + names(expTab)[names(expTab) == "genes"] <- "Ensembl" + + # smear plot + plotname <- file.path(work_dir, "data", "results", tissue_name, disease_name, "smear_plot.jpg") + title <- paste0("DGEList Smear Plot ", test_cell) + jpeg(plotname) + plotSmear(qlf, de.tags = deGenes, main = title) + abline(h = c(-1, 1), col = 2) + dev.off() + } + return(expTab) +} + + +DGE_main <- function(cmat_file, config_file, context_name, disease_name) { + test_name <- cmat_file + test_name <- unlist(strsplit(test_name, "_RawCounts"))[1] + test_list <- unlist(strsplit(test_name, "/")) + test_name <- test_list[length(test_list)] + SampMetrics <- readCountMatrix(cmat_file, config_file, disease_name) + ensembl_all <- SampMetrics[[1]][[1]][["Ensembl"]] + data_table <- dgeAnalysis(SampMetrics, test_name, context_name, disease_name) + return(data_table) +} diff --git a/main/como/rscripts/cluster_samples.R b/main/como/rscripts/cluster_samples.R new file mode 100644 index 00000000..7994f13f --- /dev/null +++ b/main/como/rscripts/cluster_samples.R @@ -0,0 +1,468 @@ +suppressPackageStartupMessages(library("ggplot2")) +suppressPackageStartupMessages(library("ggrepel")) +suppressPackageStartupMessages(library("tidyverse")) +suppressPackageStartupMessages(library("FactoMineR")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("uwot")) + +# Check if rlogs directory exists, From: https://stackoverflow.com/a/46008094 +# Then prevent messy messages from repeatedly writing to juypter +work_dir <- getwd() +r_log_directory <- str_interp("${work_dir}/logs") +if (!dir.exists(r_log_directory)) { dir.create(r_log_directory) } +zz <- file(file.path(r_log_directory, "cluster_samples.Rout"), open = "wt") +sink(zz, type = "message") + +make_logical_matrix <- function(wd, technique, context_names) { + ### organize logical matrix + files <- NULL + for (context in context_names) { + if (technique == "zfpkm") { + files <- c(files, Sys.glob(file.path(wd, context, "**", "zFPKM_Matrix_*.csv"))) + } else if (technique == "quantile") { + files <- c(files, Sys.glob(file.path(wd, context, "**", "TPM_Matrix_*.csv"))) + } else if (technique == "cpm") { + files <- c(files, Sys.glob(file.path(wd, context, "**", "CPM_Matrix_*.csv"))) + } else { + print("Invalid technique. 
Must be zfpkm, quantile, or cpm")
+      stop()
+    }
+  }
+
+  is_first <- TRUE
+  for (f in files) {
+    # Read the file, strip white space, and set the header
+    new_matrix <- read.table(f, strip.white = TRUE, header = TRUE, sep = ",", row.names = NULL) %>%
+      mutate(across(-1, as.numeric)) %>% # Make sure all columns (excluding index 0) are numeric
+      mutate(ENTREZ_GENE_ID = as.character(ENTREZ_GENE_ID)) %>% # Make index 0 a character
+      group_by(ENTREZ_GENE_ID) %>% # Group by the index 0 column
+      summarize(across(everything(), mean)) # For the remaining columns, if there are duplicate IDs, take the average of them
+
+    # Strip literal quote characters from column names, e.g. '"ENTREZ_GENE_ID"' -> 'ENTREZ_GENE_ID'
+    colnames(new_matrix) <- gsub('\"', '', colnames(new_matrix))
+
+    if (is_first) {
+      merge_matrix <- new_matrix
+      is_first <- FALSE
+    } else {
+      merge_matrix <- merge_matrix %>%
+        dplyr::left_join(
+          new_matrix,
+          by = "ENTREZ_GENE_ID"
+        )
+    }
+  }
+
+  if (technique == "zfpkm") {
+    cutoff <- -3
+    logical_matrix <- do.call(
+      cbind,
+      lapply(
+        2:ncol(merge_matrix),
+        function(j) {
+          as.integer(merge_matrix[, j] > cutoff)
+        }
+      )
+    ) %>%
+      as.data.frame(.) %>%
+      cbind(merge_matrix["ENTREZ_GENE_ID"], .) %>%
+      na.omit(.)
+  } else if (technique == "quantile") {
+    logical_matrix <- do.call(
+      cbind,
+      lapply(
+        2:ncol(merge_matrix),
+        function(j) {
+          cutoff <- quantile(merge_matrix[, j], prob = 1 - quantile / 100)
+          merge_matrix[, j] > cutoff
+        }
+      )
+    ) %>%
+      as.data.frame(.) %>%
+      cbind(merge_matrix["ENTREZ_GENE_ID"], .) %>%
+      na.omit(.)
+  } else if (technique == "cpm") {
+    logical_matrix <- do.call(
+      cbind,
+      lapply(
+        2:ncol(merge_matrix),
+        function(j) {
+          cutoff <- ifelse(
+            min_count == "default",
+            10e6 / (median(sum(merge_matrix[, j]))),
+            1e6 * min_count / (median(sum(merge_matrix[, j])))
+          )
+          merge_matrix[, j] > cutoff
+        }
+      )
+    ) %>%
+      as.data.frame(.) %>%
+      cbind(merge_matrix["ENTREZ_GENE_ID"], .) %>%
+      na.omit(.)
+  }
+  colnames(logical_matrix) <- colnames(merge_matrix)
+  # rsums <- rowSums(logical_matrix[, -1])
+  logical_matrix <- logical_matrix %>%
+    .[rowSums(.[, -1]) < ncol(.) 
- 1,] %>% + .[rowSums(.[, -1]) > 0,] + + logical_matrix <- t(logical_matrix[, -1]) # tranpose + + return(logical_matrix) +} + + +parse_contexts <- function(logical_matrix) { + contexts <- NULL + batches <- NULL + contexts <- lapply( + row.names(logical_matrix), + function(r) { + c(contexts, unlist(strsplit(r, "_S"))[1]) + } + ) + + batches <- lapply( + row.names(logical_matrix), + function(r) { + c(batches, unlist(strsplit(r, "R\\d+"))[1]) + } + ) + + contexts <- unlist(contexts) + batches <- unique(unlist(batches)) + + return(list(contexts, batches)) +} + + +# MCA +plot_MCA_replicates <- function(logical_matrix, contexts, wd, label) { + mca_results <- MCA(logical_matrix, graph = F) + d <- as.data.frame(mca_results[["ind"]][["coord"]][, 1:2]) + d["contexts"] <- as.data.frame(contexts) + colnames(d) <- c("x", "y", "contexts") + fig_path <- file.path(wd, "figures") + if (!file.exists(fig_path)) { + dir.create(fig_path) + } + + plotname <- file.path(fig_path, "mca_plot_replicates.pdf") + pdf(plotname) + p <- ggplot(d, ggplot2::aes(x = x, y = y, label = row.names(d), color = contexts)) + + geom_point(alpha = 0.7) + + geom_text_repel(max.overlaps = Inf) + + labs(x = "Dim 1", y = "Dim 2") + if (!label) { p <- remove_geom(p, "GeomTextRepel") } + print(p) + dev.off() +} + + +plot_UMAP_replicates <- function(logical_matrix, contexts, wd, label, n_neigh, min_dist) { + n_neigh <- ifelse(n_neigh == "default", as.integer(length(contexts)), n_neigh) + if (n_neigh < 2) { + print("Cannot cluster replicates if n nearest neighbors is < 1!") + stop() + } + + fac_matrix <- do.call( + cbind, + lapply(seq_len(ncol(logical_matrix)), + function(n) { + as.numeric(logical_matrix[, n]) + } + ) + ) + + coords <- data.frame( + uwot::umap( + X = fac_matrix, + n_neighbors = n_neigh, + metric = "euclidean", + min_dist = min_dist + ) + ) %>% cbind(., contexts) + row.names(coords) <- row.names(logical_matrix) + colnames(coords) <- c("x", "y", "contexts") + fig_path <- file.path(wd, "figures") + if (!file.exists(fig_path)) { + dir.create(fig_path) + } + + plotname <- file.path(fig_path, "umap_plot_replicates.pdf") + pdf(plotname) + + if (label) { + p <- ggplot(coords, ggplot2::aes(x = x, y = y, label = row.names(coords), color = contexts)) + + geom_point(alpha = 0.7) + + geom_text_repel(max.overlaps = Inf) + + labs(x = "Dim 1", y = "Dim 2") + } else { + p <- ggplot(coords, ggplot2::aes(x = x, y = y, color = contexts)) + + geom_point(alpha = 0.7) + + labs(x = "Dim 1", y = "Dim 2") } + print(p) + dev.off() +} + + +plot_replicates <- function(logical_matrix, contexts, wd, clust_algo, label, n_neigh = "default", min_dist = 0.01) { + switch( + tolower(clust_algo), + mca = plot_MCA_replicates(logical_matrix, contexts, wd, label), + umap = plot_UMAP_replicates(logical_matrix, contexts, wd, label, n_neigh, min_dist) + ) +} + + +make_batch_logical_matrix <- function(logical_matrix, batches, ratio) { + logical_matrix <- t(logical_matrix) + log_mat_batch <- data.frame(genes = row.names(logical_matrix)) + for (batch in batches) { + batch_log_mat <- logical_matrix[, grep(paste0("^", batch), colnames(logical_matrix))] + if (nrow(batch_log_mat) < 2) { + log_mat_batch <- cbind(log_mat_batch, batch_log_mat) + } else { + log_mat_batch <- cbind(log_mat_batch, (rowSums(batch_log_mat) / ncol(batch_log_mat)) > ratio) + } + } + log_mat_batch["genes"] <- NULL + colnames(log_mat_batch) <- batches + log_mat_batch <- t(log_mat_batch) + + return(log_mat_batch) +} + + +plot_MCA_batches <- function(log_mat_batch, batches, wd, label) { + contexts <- NULL 
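+  # Recover the context (cell type) name for each batch by taking everything before "_S"
+  # in the batch label, e.g. a batch labelled "naiveB_S1" belongs to the context "naiveB".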
+ contexts <- lapply(batches, function(r) { + c(contexts, unlist(strsplit(r, "_S"))[1]) + }) %>% unlist(.) + + mca_results <- MCA(log_mat_batch, graph = F) + d <- as.data.frame(mca_results[["ind"]][["coord"]][, 1:2]) + d["contexts"] <- as.data.frame(contexts) + colnames(d) <- c("x", "y", "contexts") + fig_path <- file.path(wd, "figures") + if (!file.exists(fig_path)) { + dir.create(fig_path) + } + + plotname <- file.path(fig_path, "mca_plot_batches.pdf") + pdf(plotname) + p <- ggplot(d, ggplot2::aes(x = x, y = y, label = row.names(d), color = contexts)) + + geom_point(alpha = 0.7) + + geom_text_repel(max.overlaps = Inf) + + labs(x = "Dim 1", y = "Dim 2") + print(p) + dev.off() + +} + + +plot_UMAP_batches <- function(log_mat_batch, batches, wd, label, n_neigh, min_dist) { + n_neigh <- ifelse(n_neigh == "default", as.integer(length(batches)), n_neigh) + if (n_neigh < 2) { + print("Cannot cluster batches if n nearest neighbors is < 2!") + stop() + } + + contexts <- NULL + contexts <- lapply( + batches, + function(r) { + c(contexts, unlist(strsplit(r, "_S"))[1]) + } + ) %>% unlist(.) + + binary_matrix <- do.call( + cbind, + lapply( + seq_len(ncol(log_mat_batch)), + function(n) { + as.numeric(log_mat_batch[, n]) + } + ) + ) + + coords <- data.frame( + umap( + binary_matrix, + n_neighbors = n_neigh, + init = "pca", + min_dist = min_dist + ) + ) + + row.names(coords) <- row.names(log_mat_batch) + colnames(coords) <- c("x", "y") + fig_path <- file.path(wd, "figures") + if (!file.exists(fig_path)) { + dir.create(fig_path) + } + + plotname <- file.path(fig_path, "umap_plot_batches.pdf") + pdf(plotname) + p <- ggplot(coords, ggplot2::aes(x = x, y = y, label = row.names(coords), color = contexts)) + + geom_point(alpha = 0.7) + + geom_text_repel(max.overlaps = Inf) + + labs(x = "Dim 1", y = "Dim 2") + print(p) + dev.off() +} + + +plot_batches <- function(log_mat_batch, batches, wd, clust_algo, label, n_neigh = "default", min_dist = 0.01) { + switch( + tolower(clust_algo), + mca = plot_MCA_batches(log_mat_batch, batches, wd, label), + umap = plot_UMAP_batches(log_mat_batch, batches, wd, label, n_neigh, min_dist) + ) +} + + +make_context_logical_matrix <- function(log_mat_batch, contexts, ratio) { + contexts <- unique(contexts) + log_mat_batch <- t(log_mat_batch) + log_mat_context <- data.frame(genes = row.names(log_mat_batch)) + for (context in contexts) { + context_log_mat <- as.data.frame(log_mat_batch[, grep(paste0("^", context), colnames(log_mat_batch))]) + if (nrow(context_log_mat) < 2) { + log_mat_context <- cbind(log_mat_context, context_log_mat) + } else { + log_mat_context <- cbind(log_mat_context, (rowSums(context_log_mat) / ncol(context_log_mat)) > ratio) + } + } + log_mat_context["genes"] <- NULL + colnames(log_mat_context) <- contexts + log_mat_context <- t(log_mat_context) + + return(log_mat_context) +} + + +plot_MCA_contexts <- function(log_mat_context, contexts, wd) { + contexts <- unique(contexts) + mca_results <- MCA(log_mat_context, graph = F) + d <- as.data.frame(mca_results[["ind"]][["coord"]][, 1:2]) + d["contexts"] <- as.data.frame(contexts) + colnames(d) <- c("x", "y", "contexts") + fig_path <- file.path(wd, "figures") + if (!file.exists(fig_path)) { + dir.create(fig_path) + } + + plotname <- file.path(fig_path, "mca_plot_contexts.pdf") + pdf(plotname) + p <- ggplot(d, ggplot2::aes(x = x, y = y, label = row.names(d), color = contexts)) + + geom_point(alpha = 0.7) + + geom_text_repel(max.overlaps = Inf) + + labs(x = "Dim 1", y = "Dim 2") + print(p) + dev.off() + +} + + 
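+# Project the context-level logical (active/inactive gene) matrix into two dimensions with
+# UMAP (uwot) and write the scatter plot to figures/umap_plot_contexts.pdf. An illustrative
+# call sequence (a sketch mirroring cluster_samples_main() at the bottom of this file):
+#   log_mat_context <- make_context_logical_matrix(log_mat_batch, contexts, batch_ratio)
+#   plot_contexts(log_mat_context, wd, "umap", contexts, label = TRUE)
+# plot_contexts() dispatches to the function below when clust_algo is "umap".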
+plot_UMAP_contexts <- function(log_mat_context, contexts, wd, label, n_neigh, min_dist) { + contexts <- unique(contexts) + n_neigh <- ifelse(n_neigh == "default", as.integer(length(contexts)), n_neigh) + if (n_neigh < 2) { + print("Cannot cluster contexts if n nearest neighbors is < 1! Exiting now.") + # Exit cleanly + + q(save = "no") + } + + binary_matrix <- do.call( + cbind, + lapply( + seq_len(ncol(log_mat_context)), + function(n) { + as.numeric(log_mat_context[, n]) + } + ) + ) + + coords <- data.frame( + umap( + binary_matrix, + n_neighbors = n_neigh, + init = "pca", + min_dist = min_dist + ) + ) + + row.names(coords) <- row.names(log_mat_context) + colnames(coords) <- c("x", "y") + fig_path <- file.path(wd, "figures") + if (!file.exists(fig_path)) { + dir.create(fig_path) + } + plotname <- file.path(fig_path, "umap_plot_contexts.pdf") + pdf(plotname) + p <- ggplot(coords, ggplot2::aes(x = x, y = y, label = row.names(coords), color = contexts)) + + geom_point(alpha = 0.7) + + geom_text_repel(max.overlaps = Inf) + + labs(x = "Dim 1", y = "Dim 2") + print(p) + dev.off() +} + + +plot_contexts <- function( + log_mat_context, + wd, + clust_algo, + contexts, + label, + n_neigh = "default", + min_dist = 0.01) +{ + switch( + tolower(clust_algo), + mca = plot_MCA_contexts(log_mat_context, contexts, wd, label), + umap = plot_UMAP_contexts(log_mat_context, contexts, wd, label, n_neigh, min_dist) + ) +} + + +cluster_samples_main <- function( + wd, + context_names, + technique, + clust_algo, + label, + min_dist = 0.01, + n_neigh_rep = "default", + n_neigh_batch = "default", + n_neigh_cont = "default", + rep_ratio = 0.5, + batch_ratio = 0.5, + quantile = 25, + min_count = "default", + seed = 12345) +{ + set.seed(seed) + + print("Making logical matrix") + logical_matrix <- make_logical_matrix(wd, technique, context_names) + + res <- parse_contexts(logical_matrix) + contexts <- res[[1]] + batches <- res[[2]] + print("Clustering replicate level filtered data") + plot_replicates(logical_matrix, contexts, wd, clust_algo, label, n_neigh = n_neigh_rep, min_dist = min_dist) + + print("Clustering batch level filtered data") + log_mat_batch <- make_batch_logical_matrix(logical_matrix, batches, rep_ratio) + rm(logical_matrix) + plot_batches(log_mat_batch, batches, wd, clust_algo, label, n_neigh = n_neigh_batch, min_dist = min_dist) + + print("Clustering context level filtered data") + log_mat_context <- make_context_logical_matrix(log_mat_batch, contexts, batch_ratio) + rm(log_mat_batch) + plot_contexts(log_mat_context, wd, clust_algo, contexts, label, n_neigh = n_neigh_cont, min_dist = min_dist) +} diff --git a/main/como/rscripts/cluster_sources.R b/main/como/rscripts/cluster_sources.R new file mode 100644 index 00000000..10534e03 --- /dev/null +++ b/main/como/rscripts/cluster_sources.R @@ -0,0 +1,119 @@ +# This file will cluster transcriptomic and proteomic sources +# It will perform the following functions +# 1. reads the zFPKMs (or TPMs or CPMs) for each replicate +# *binarizes them if given that arg (see RNAseq.R using the kOverA function) +# 2. clusters all replicates for all batches/studies across all contexts/cell types within one data source at a time +# 3. clusters all replicates for all batches/studies across all data sources within one cell type at a time +# 4. clusters all batches/studies across all contexts/cell types within one data source at a time +# 5. 
clusters all batches/studies across all data sources within one cell type at a time (edited) + +suppressPackageStartupMessages(library("stringr")) +suppressPackageStartupMessages(library("tools")) + +get_study_value <- function(file_path) { + # This function will get the S## value from the file path + + # Find "S##.csv", but use a lookahead to avoid "taking" the ".csv" portion + match <- stringr::str_extract(string = toupper(file_path), pattern = "S\\d{1,2}(?=\\.CSV)") + + if (!is.na(match)) + match <- toupper(match) + + return(match) +} + +get_replicate_files <- function(results_directory, context_names, source_type, use_trna, use_mrna) { + + # This function will get the file paths of zFPKMs for each replicate + all_context_files <- list() + for (context_name in context_names) { + lower_context_name <- tolower(context_name) + source_type <- tolower(source_type) + + current_context_files <- list.files(file.path(results_directory, lower_context_name), full.names = TRUE, recursive = TRUE) + context_files <- c() + + for (file in current_context_files) { + file_name <- tolower(file) + + # Check the current file meets our criteria + if ( + tools::file_ext(file_name) == "csv" && # Ensure the current file is a CSV file + grepl(lower_context_name, file_name) && # Test if the current file is part of the current context (i.e., naiveB, immNK) + grepl(source_type, file_name) && # Test if the current file has source type (zFPKM, TPM, CPM) + # Create a "group" that works if either trna or mrna is TRUE + ( + (use_trna && grepl("total", file_name)) || # Test if the current file is a total-rna file + (use_mrna && grepl("mrna", file_name)) # Test if the current file is an mRNA (polyA) file + ) + ) { + context_files <- append(context_files, file) + } + } + + # Only append new list if context_files has at least one item + if (length(context_files) > 0) + all_context_files[[context_name]] <- context_files + } + + # Return list if it has at least one item, otherwise return "NA" + if (length(all_context_files) > 0) { + return(all_context_files) + } else { + return(NA) + } +} + +read_matrix_values <- function(study_files) { + # This function is responsible for reading in the matrix values found within the replicate files + # It takes the list of replicate files and returns a list of lists of matrix values + replicate_dataframes <- list() + context_names <- names(study_files) + + for (context in context_names) { + index <- 1 + context_files <- study_files[[context]] + context_dataframe <- c() + for (file in context_files) { + dataframe <- read.csv(file, header = TRUE) + + study <- get_study_value(file) + context_dataframe[[study]] <- dataframe + + index <- index + 1 + } + + if (length(context_dataframe) > 0) { + replicate_dataframes[[context]] <- context_dataframe + } + } + if (length(replicate_dataframes) > 0) { + return(replicate_dataframes) + } else { + return(NA) + } +} + + +cluster_sources_main <- function( + results_directory, + context_names, + source_type, + use_trna, + use_mrna, + binarize_data +) { + + study_files <- get_replicate_files(results_directory = results_directory, context_names = context_names, source_type = source_type, use_trna = use_trna, use_mrna = use_mrna) + study_dataframes <- read_matrix_values(study_files = study_files) + print("DONE") +} + + +results_directory <- "/Users/joshl/docker/madrid/local_files/results" +context_names <- list("immNK", "naiveB") +source_type <- "zFPKM" +use_trna <- TRUE +use_mrna <- TRUE +binarize_data <- FALSE +cluster_sources_main(results_directory = 
results_directory, context_names = context_names, source_type = source_type, use_trna = use_trna, use_mrna = use_mrna, binarize_data = binarize_data) diff --git a/main/como/rscripts/combine_distributions.R b/main/como/rscripts/combine_distributions.R new file mode 100644 index 00000000..54a888dd --- /dev/null +++ b/main/como/rscripts/combine_distributions.R @@ -0,0 +1,621 @@ +suppressPackageStartupMessages(library("stringr")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("tidyverse")) +suppressPackageStartupMessages(library("ggplot2")) + +# Check if rlogs directory exists, From: https://stackoverflow.com/a/46008094 +# Then prevent messy messages from repeatedly writing to juypter +work_dir <- getwd() +r_log_directory <- stringr::str_interp("${work_dir}/logs") +if (!dir.exists(r_log_directory)) { dir.create(r_log_directory) } +zz <- file(file.path(r_log_directory, "combine_distributions.Rout"), open = "wt") +sink(zz, type = "message") + + +get_batch_name <- function(x) { + basename(x) + return(substring(basename(x), 1, nchar(basename(x)) - 4)) +} + +parse_contexts_zfpkm <- function(wd, contexts, prep) { + + batches <- list() + for (context in contexts) { + files <- Sys.glob(file.path(wd, context, prep, paste0("zFPKM_Matrix_", prep, "_*.csv"))) + batches[[context]] <- unlist(lapply(files, get_batch_name)) + } + + return(batches) +} + + +parse_contexts_zumi <- function(wd, contexts, prep) { + + batches <- list() + for (context in contexts) { + files <- Sys.glob(file.path(wd, context, prep, paste0("zUMI_Matrix_", prep, "_*.csv"))) + batches[[context]] <- unlist(lapply(files, get_batch_name)) + } + + return(batches) +} + + +parse_contexts_zscore_prot <- function(wd, contexts) { + + batches <- list() + # files <- Sys.glob(file.path(wd, "*", "proteomics", "protein_zscore_Matrix_*.csv")) + for (context in contexts) { + files <- Sys.glob(file.path(wd, context, "proteomics", "protein_zscore_Matrix_*.csv")) + batches[[context]] <- unlist(lapply(files, get_batch_name)) + } + + return(batches) +} + + +merge_batch <- function(wd, context, batch) { + files <- Sys.glob(file.path(wd, paste0("*", batch, "*"))) + nrep <- c() + stopifnot(length(files) > 0) + + for (f in files) { + zmat <- read.table(f, strip.white = T, header = T, sep = ",", row.names = NULL) %>% # read expression matrix + mutate(across(colnames(.)[-1], as.numeric)) %>% # ensure expression values are numbers + mutate(across(colnames(.)[1], as.character)) %>% # ensure ENTREZ_GENE_IDs are character + group_by(ENTREZ_GENE_ID) %>% + summarise_each(funs(max)) %>% # if multiple of same ENTREZ_GENE_ID, take max value + na.omit(.) %>% + as.data.frame(.) + + nrep <- c(nrep, ncol(zmat) - 1) + entrez_gene <- zmat[, "ENTREZ_GENE_ID"] + rep_names <- colnames(zmat) + zmat <- do.call(cbind, lapply(2:ncol(zmat), function(j) { + repz <- zmat[, j] + })) %>% + cbind(as.character(entrez_gene), .) %>% + as.data.frame(.) %>% + na.omit(.) + + colnames(zmat) <- rep_names + + stack_df <- do.call(rbind, lapply(2:ncol(zmat), function(j) { + repz <- as.numeric(as.character(zmat[, j])) + cbind(ENTREZ_GENE_ID = zmat[, "ENTREZ_GENE_ID"], zscore = repz, source = rep(colnames(zmat)[j], length(repz))) + })) %>% as.data.frame(.) 
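+    # stack_df is the long-format view of zmat: one row per (gene, replicate) pair, with the
+    # replicate name kept in "source" so the density plot below can colour each replicate.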
+ + stack_df$zscore <- as.numeric(as.character(stack_df$zscore)) + + plot_name_pdf <- file.path( + wd, + "figures", + paste0( + "plot_", context, "_", + substring( + basename(f), + 1, + nchar(basename(f)) - 4 + ), + ".pdf" + ) + ) + + plot_name_png <- file.path( + wd, + "figures", + paste0( + "plot_", + context, + "_", + substring( + basename(f), + 1, + nchar(basename(f)) - 4 + ), + ".png" + ) + ) + + + pdf(plot_name_pdf) + png( + plot_name_png, + res = 1200, + units = "in", + width = 3.25, + height = 3.25, + type = "cairo" + ) + # label <- colnames(stack_df)[-1] + simplified_plot <- ifelse(length(unique(stack_df$source)) > 10, TRUE, FALSE) + plot <- ggplot(stack_df, aes(zscore, color = source)) + + geom_density() + + theme(text = element_text(size = 12, family = "sans")) + if (simplified_plot) { + plot <- plot + theme(legend.position = "none") + } + max_dens <- 0 + # get y upper limit by finding density of peak at z = 0 + for (source in unique(plot$data$source)) { + source_z <- plot$data$zscore[plot$data$source == source] %>% .[!is.na(.)] + source_densx <- density(source_z)$x + source_densy <- density(source_z)$y + idx <- min(max(which(source_densx <= 0)), min(which(source_densx == 0))) + max_d <- source_densy[idx] + if (max_d > max_dens) { + max_dens <- max_d + } + } + # plot <- plot + ylim(0, 1.5*max_dens) + dev.off() + } + + return(list(zmat, nrep)) +} + + +combine_batch_zdistro <- function(wd, context, batch, zmat) { + plot_name_pdf <- file.path(wd, "figures", paste0("plot_", context, "_", batch, "_combine_distro", ".pdf")) + plot_name_png <- file.path(wd, "figures", paste0("plot_", context, "_", batch, "_combine_distro", ".png")) + + weighted_z <- function(x) { + floor_score <- -6 + ceil_score <- 6 + x <- as.numeric(x) + numer <- sum(x) + denom <- sqrt(length(x)) + result <- numer / denom + if (result < floor_score) { result <- floor_score } + if (result > ceil_score) { result <- ceil_score } + return(result) + } + + if (ncol(zmat) > 2) { + combine_z <- apply(zmat[, -1], 1, weighted_z) + merge_df <- cbind(zmat, combined = combine_z) + combine_z <- cbind(ENTREZ_GENE_ID = as.character(zmat[, "ENTREZ_GENE_ID"]), combine_z) + + stack_df <- do.call(rbind, lapply(2:ncol(merge_df), function(j) { + repz <- as.numeric(as.character(merge_df[, j])) + cbind(ENTREZ_GENE_ID = merge_df[, "ENTREZ_GENE_ID"], zscore = repz, source = rep(colnames(merge_df)[j], length(repz))) + })) %>% as.data.frame(.) 
+ stack_df$zscore <- as.numeric(as.character(stack_df$zscore)) + + simplified_plot <- ifelse(length(unique(stack_df$source)) > 10, TRUE, FALSE) + label <- colnames(stack_df)[-1] + pdf(plot_name_pdf) + png( + plot_name_png, + res = 1200, + units = "in", + width = 3.25, + height = 3.25, + type = "cairo" + ) + + if (simplified_plot) { + #p <- p + theme(legend.position = "none") + stack_df <- stack_df[stack_df$source == "combined",] + } + + p <- ggplot(stack_df, aes(zscore, color = source)) + + geom_density() + + theme(text = element_text(size = 12, family = "sans")) + + max_dens <- 0 + # get y upper limit by finding density of peak at z = 0 + for (source in unique(p$data$source)) { + source_z <- p$data$zscore[p$data$source == source] %>% .[!is.na(.)] #%>% .[!is.nan(.)] + source_densx <- density(source_z)$x + source_densy <- density(source_z)$y + idx <- min(max(which(source_densx <= 0.5)), min(which(source_densx == 0.5))) + max_d <- source_densy[idx] + if (max_d > max_dens) { max_dens <- max_d } + } + #p <- p + ylim(0, 1.5*max_dens) + print(p) + dev.off() + + } else { + combine_z <- zmat + } + + return(as.data.frame(combine_z)) +} + + +combine_context_zdistro <- function(wd, context, n_reps, zmat) { + plot_name_pdf <- file.path(wd, "figures", paste0( + "plot_", context, "_combine_batches_distro", ".pdf")) + plot_name_png <- file.path(wd, "figures", paste0( + "plot_", context, "_combine_batches_distro", ".png")) + + weighted_z <- function(x, n_reps) { + floor_score <- -6 + ceil_score <- 6 + x <- as.numeric(x) + nas <- sort(unique(c(which(is.nan(x)), which(is.na(x))))) + weights <- c() + for (i in seq_along(n_reps)) { weights <- c(weights, (n_reps[i]) / sum(n_reps)) } + if (length(nas) > 0) { + x <- x[-nas] + weights <- weights[-nas] + } + numer <- sum(weights * x) + denom <- sqrt(sum(weights^2)) + result <- numer / denom + if (result < floor_score) { result <- floor_score } + if (result > ceil_score) { result <- ceil_score } + return(result) + } + + if (ncol(zmat) > 2) { + combine_z <- apply(zmat[, -1], 1, weighted_z, n_reps = n_reps) + merge_df <- cbind(zmat, combined = combine_z) + combine_z <- cbind(ENTREZ_GENE_ID = as.character(zmat[, "ENTREZ_GENE_ID"]), combine_z) + + stack_df <- do.call(rbind, lapply(2:ncol(merge_df), function(j) { + repz <- as.numeric(as.character(merge_df[, j])) + cbind(ENTREZ_GENE_ID = merge_df[, 1], zscore = repz, source = rep(colnames(merge_df)[j], length(repz))) + })) %>% as.data.frame(.) 
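+    # "combined" holds the batch-combined z-score per gene: a Stouffer-style combination in
+    # which each batch is weighted by its replicate count (n_reps) and the result is clamped
+    # to [-6, 6] by weighted_z() above.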
+ stack_df$zscore <- as.numeric(as.character(stack_df$zscore)) + + label <- colnames(stack_df)[-1] + pdf(plot_name_pdf) + png( + plot_name_png, + res = 1200, + units = "in", + width = 3.25, + height = 3.25, + type = "cairo" + ) + p <- ggplot(stack_df, aes(zscore, color = source)) + + geom_density() + + theme(text = element_text(size = 12, family = "sans")) + + max_dens <- 0 + # get y upper limit by finding density of peak at z = 0 + for (source in unique(p$data$source)) { + source_z <- p$data$zscore[p$data$source == source] %>% .[!is.na(.)] #%>% .[!is.nan(.)] + source_densx <- density(source_z)$x + source_densy <- density(source_z)$y + idx <- min(max(which(source_densx <= 0)), min(which(source_densx == 0))) + max_d <- source_densy[idx] + if (max_d > max_dens) { max_dens <- max_d } + } + #p <- p + ylim(0, 1.5*max_dens) + print(p) + dev.off() + + } else { + combine_z <- zmat + colnames(combine_z) <- c("ENTREZ_GENE_ID", "combine_z") + } + + return(as.data.frame(combine_z)) +} + + +combine_omics_zdistros <- function( + wd, + context, + comb_batches_z_trna, + comb_batches_z_mrna, + comb_batches_z_scrna, + comb_batches_z_prot, + tweight, + mweight, + sweight, + pweight, + keep_gene_scores = TRUE) { + + + fig_path <- file.path(wd, context, "figures") + if (!file.exists(fig_path)) { + dir.create(fig_path, recursive = TRUE) + } + plot_name_pdf <- file.path(fig_path, paste0("plot_", context, "_combine_omics_distro", ".pdf")) + plot_name_png <- file.path(fig_path, paste0("plot_", context, "_combine_omics_distro", ".png")) + + weights <- c() + names <- c() + dfs <- list() + counter <- 0 + if (tweight > 0) { + counter <- counter + 1 + weights <- c(weights, tweight) + names <- c(names, "total") + dfs[[counter]] <- comb_batches_z_trna + } + if (mweight > 0) { + counter <- counter + 1 + weights <- c(weights, mweight) + names <- c(names, "polyA") + dfs[[counter]] <- comb_batches_z_mrna + } + if (sweight > 0) { + counter <- counter + 1 + weights <- c(weights, sweight) + names <- c(names, "singleCell") + dfs[[counter]] <- comb_batches_z_scrna + } + if (pweight > 0) { + counter <- counter + 1 + weights <- c(weights, pweight) + names <- c(names, "proteome") + dfs[[counter]] <- comb_batches_z_prot + } + + weighted_z <- function(x, weights) { + floor_score <- -6 + ceil_score <- 10 + x <- as.numeric(x) + + nas <- which(is.na(x)) + if (length(nas) > 0) { + x <- x[-nas] + weights <- weights[-nas] + } + weights <- weights / sum(weights) + numer = sum(weights * x) + denom = sqrt(sum(weights^2)) + result <- numer / denom + if (result < floor_score) { result <- floor_score } + if (result > ceil_score) { result <- ceil_score } + return(result) + } + + for (i in 1:counter) { + add_df <- dfs[[i]] + colnames(add_df)[2] <- names[i] + if (i == 1) { zmat <- add_df } + else { zmat <- full_join(zmat, add_df, by = "ENTREZ_GENE_ID", copy = TRUE) } + } + + if (ncol(zmat) > 2) { + combine_z <- apply(zmat[, -1], 1, weighted_z, weights = weights) + } else { + combine_z = zmat[, -1] + } + + merge_df <- cbind(zmat, combined = combine_z) + combine_z <- cbind(ENTREZ_GENE_ID = as.character(zmat[, "ENTREZ_GENE_ID"]), combine_z) + + stack_df <- do.call(rbind, lapply(2:ncol(merge_df), function(j) { + repz <- as.numeric(as.character(merge_df[, j])) + cbind(ENTREZ_GENE_ID = merge_df[, 1], zscore = repz, source = rep(colnames(merge_df)[j], length(repz))) + })) %>% as.data.frame(.) 
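+  # "combined" is the cross-omics score: per-gene z-scores from the active sources (total RNA,
+  # polyA RNA, single-cell RNA, proteomics) are combined using the user-supplied weights,
+  # renormalized over non-missing sources, and clamped to [-6, 10] by weighted_z() above.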
+ stack_df$zscore <- as.numeric(as.character(stack_df$zscore)) + + label <- colnames(stack_df)[-1] + pdf(plot_name_pdf) + png( + plot_name_png, + res = 1200, + units = "in", + width = 3.25, + height = 3.25, + type = "cairo" + ) + + p <- ggplot(stack_df, aes(zscore, color = source)) + + geom_density() + + theme(text = element_text(size = 12, family = "sans")) + + max_dens <- 0 + # get y upper limit by finding density of peak at z = 0 + for (source in unique(p$data$source)) { + source_z <- p$data$zscore[p$data$source == source] %>% .[!is.na(.)] #%>% .[!is.nan(.)] + source_densx <- density(source_z)$x + source_densy <- density(source_z)$y + idx <- min(max(which(source_densx <= 0)), min(which(source_densx == 0))) + max_d <- source_densy[idx] + if (max_d > max_dens) { max_dens <- max_d } + } + #p <- p + ylim(0, 1.5*max_dens) + print(p) + dev.off() + + #if ( keep_gene_scores ) { + # combine_z <- rbind(combine_z, comb_batches_z_rna[!(comb_batches_z_rna$ENTREZ_GENE_ID %in% combine_z$ENTREZ_GENE_ID), "ENTREZ_GENE_ID"]) + #} + + return(combine_z) +} + + +combine_zscores_main <- function( + working_dir, + context_names, + global_use_mrna, + global_use_trna, + global_use_scrna, + global_use_proteins, + keep_gene_scores, + global_trna_weight, + global_mrna_weight, + global_scrna_weight, + global_protein_weight +) { + + figure_output_dir <- file.path(working_dir, "figures") + if (!file.exists(figure_output_dir)) { dir.create(figure_output_dir) } + + global_trna_batches <- parse_contexts_zfpkm(working_dir, context_names, "total") + global_mrna_batches <- parse_contexts_zfpkm(working_dir, context_names, "mrna") + global_scrna_batches <- parse_contexts_zumi(working_dir, context_names, "scrna") + global_protein_batches <- parse_contexts_zscore_prot(working_dir, context_names) + + for (context in context_names) { + context_use_trna <- global_use_trna + context_use_mrna <- global_use_mrna + context_use_scrna <- global_use_scrna + context_use_proteins <- global_use_proteins + + context_trna_weight <- global_trna_weight + context_mrna_weight <- global_mrna_weight + context_scrna_weight <- global_scrna_weight + context_protein_weight <- global_protein_weight + + context_trna_batch <- global_trna_batches[[context]] + context_mrna_batch <- global_mrna_batches[[context]] + context_scrna_batch <- global_scrna_batches[[context]] + context_protein_batch <- global_protein_batches[[context]] + + + if (length(context_trna_batch) == 0 & global_use_trna) { + context_use_trna <- FALSE + print(paste0("No total RNA-seq zFPKM Matrix files found for ", context, ". Will not use for this context.")) + } + + if (length(context_mrna_batch) == 0 & global_use_mrna) { + context_use_mrna <- FALSE + print(paste0("No polyA RNA-seq zFPKM Matrix files found for ", context, ". Will not use for this context.")) + } + + if (length(context_scrna_batch) == 0 & global_use_scrna) { + context_use_scrna <- FALSE + print(paste0("No SC RNA-seq zFPKM Matrix files found for ", context, ". Will not use for this context.")) + } + + if (length(context_protein_batch) == 0 & global_use_proteins) { + context_use_proteins <- FALSE + print(paste0("No proteomics z-score Matrix files found for ", context, ". 
Will not use for this context.")) + } + + if (context_use_trna) { + print("Will merge total RNA-seq distributions") + trna_workdir <- file.path(working_dir, context, "total") + num_reps <- c() + count <- 0 + for (batch in context_trna_batch) { + res <- merge_batch(trna_workdir, context, batch) + zmat <- res[[1]] + num_reps <- c(num_reps, res[[2]]) + comb_z <- combine_batch_zdistro(trna_workdir, context, batch, zmat) + colnames(comb_z) <- c("ENTREZ_GENE_ID", batch) + if (!count) { merge_z <- comb_z } + else { merge_z <- full_join(merge_z, comb_z, by = "ENTREZ_GENE_ID") } + count <- count + 1 + } + + comb_batches_z_trna <- combine_context_zdistro(trna_workdir, context, num_reps, merge_z) + filename <- file.path(trna_workdir, paste0("combined_zFPKM_", context, ".csv")) + write.csv(comb_batches_z_trna, filename, row.names = FALSE) + + if (!context_use_proteins & !context_use_mrna & !context_use_scrna) { + filename <- file.path(working_dir, context, "total", paste0("model_scores_", context, ".csv")) + write.csv(comb_batches_z_trna, filename, row.names = FALSE) + } + + } else { comb_batches_z_trna <- NA } + + + if (context_use_mrna) { + print("Will merge polyA enriched RNA-seq distributions") + mrna_workdir <- file.path(working_dir, context, "mrna") + num_reps <- c() + count <- 0 + for (batch in context_mrna_batch) { + res <- merge_batch(mrna_workdir, context, batch) + zmat <- res[[1]] + num_reps <- c(num_reps, res[[2]]) + comb_z <- combine_batch_zdistro(mrna_workdir, context, batch, zmat) + colnames(comb_z) <- c("ENTREZ_GENE_ID", batch) + if (!count) { merge_z <- comb_z } + else { merge_z <- full_join(merge_z, comb_z, by = "ENTREZ_GENE_ID") } + count <- count + 1 + } + + comb_batches_z_mrna <- combine_context_zdistro(mrna_workdir, context, num_reps, merge_z) + filename <- file.path(mrna_workdir, paste0("combined_zFPKM_", context, ".csv")) + write.csv(comb_batches_z_mrna, filename, row.names = FALSE) + + if (!context_use_proteins & !context_use_trna & !context_use_scrna) { + filename <- file.path(mrna_workdir, paste0("model_scores_", context, ".csv")) + write.csv(comb_batches_z_mrna, filename, row.names = FALSE) + } + + } else { comb_batches_z_mrna <- NA } + + + if (context_use_scrna) { + print("Will merge single-cell RNA-seq distributions") + scrna_workdir <- file.path(working_dir, context, "scrna") + num_reps <- c() + count <- 0 + for (batch in context_scrna_batch) { + res <- merge_batch(scrna_workdir, context, batch) + zmat <- res[[1]] + num_reps <- c(num_reps, res[[2]]) + comb_z <- combine_batch_zdistro(scrna_workdir, context, batch, zmat) + colnames(comb_z) <- c("ENTREZ_GENE_ID", batch) + if (!count) { merge_z <- comb_z } + else { merge_z <- full_join(merge_z, comb_z, by = "ENTREZ_GENE_ID") } + count <- count + 1 + } + + comb_batches_z_scrna <- combine_context_zdistro(scrna_workdir, context, num_reps, merge_z) + filename <- file.path(scrna_workdir, paste0("combined_zFPKM_", context, ".csv")) + write.csv(comb_batches_z_scrna, filename, row.names = FALSE) + + if (!context_use_proteins & !context_use_trna & !context_use_mrna) { + filename <- file.path(scrna_workdir, paste0("model_scores_", context, ".csv")) + write.csv(comb_batches_z_scrna, filename, row.names = FALSE) + } + + } else { comb_batches_z_scrna <- NA } + + + if (context_use_proteins) { + print("Will merge protein abundance distributions") + protein_workdir <- file.path(working_dir, context, "proteomics") + num_reps <- c() + count <- 0 + for (batch in context_protein_batch) { + res <- merge_batch(protein_workdir, context, batch) + 
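+        # merge_batch() returns a two-element list: [[1]] the per-replicate z-score matrix for
+        # this batch and [[2]] the replicate counts, which weight the batches when the
+        # batch-level distributions are combined below.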
zmat <- res[[1]] + num_reps <- c(num_reps, res[[2]]) + comb_z <- combine_batch_zdistro(protein_workdir, context, batch, zmat) + colnames(comb_z) <- c("ENTREZ_GENE_ID", batch) + if (!count) { merge_z <- comb_z } + else { merge_z <- full_join(merge_z, comb_z, by = "ENTREZ_GENE_ID") } + count <- count + 1 + } + + comb_batches_z_prot <- combine_context_zdistro(protein_workdir, context, num_reps, merge_z) + filename <- file.path(protein_workdir, paste0("combined_zscore_proteinAbundance_", context, ".csv")) + write.csv(comb_batches_z_prot, filename, row.names = FALSE) + + if (!context_use_mrna & !context_use_trna & !context_use_scrna) { + filename <- file.path(protein_workdir, paste0("model_scores_", context, ".csv")) + write.csv(comb_batches_z_prot, filename, row.names = FALSE) + } + + } else { comb_batches_z_prot <- NA } + + if (!context_use_trna) { context_trna_weight <- 0 } + if (!context_use_mrna) { context_mrna_weight <- 0 } + if (!context_use_scrna) { context_scrna_weight <- 0 } + if (!context_use_proteins) { context_protein_weight <- 0 } + + comb_omics_z <- combine_omics_zdistros( + working_dir, + context, + comb_batches_z_trna, + comb_batches_z_mrna, + comb_batches_z_scrna, + comb_batches_z_prot, + context_trna_weight, + context_mrna_weight, + context_scrna_weight, + context_protein_weight + ) + + filename <- file.path(working_dir, context, paste0("model_scores_", context, ".csv")) + write.csv(comb_omics_z, filename, row.names = FALSE) + } + +} \ No newline at end of file diff --git a/main/como/rscripts/protein_transform.R b/main/como/rscripts/protein_transform.R new file mode 100644 index 00000000..ce98b5e3 --- /dev/null +++ b/main/como/rscripts/protein_transform.R @@ -0,0 +1,192 @@ +suppressPackageStartupMessages(library("ggplot2")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("tidyverse")) +suppressPackageStartupMessages(library("zoo")) + +# Check if rlogs directory exists, From: https://stackoverflow.com/a/46008094 +# Then prevent messy messages from repeatedly writing to juypter +work_dir <- getwd() +r_log_directory <- str_interp("${work_dir}/logs") +if (!dir.exists(r_log_directory)) { dir.create(r_log_directory) } +zz <- file(file.path(r_log_directory, "protein_transform.Rout"), open="wt") +sink(zz, type="message") + +z_result <- function(z_vector, density, mu, stdev, max_y) { + z_res <- list( + z = z_vector, + d = density, + m = mu, + s = stdev, + max_y = max_y + ) + + class(z_vector) <- append(class(z_res), "zFPKM") + return(z_res) +} + + +z_score_calc <- function(abundance, min_thresh) { + if (!is.numeric(abundance)) { + stop("argument 'abundance' must be numeric") + } + abundance <- abundance + 1 + log_abundance_filt <- log(abundance[abundance>min_thresh], base=2) + log_abundance <- log(abundance, base=2) + d <- density(log_abundance_filt) + + # calculate rolling average + perc <- as.integer(0.01*length(d[["y"]]) + 1) # 10% roll avg interval + d[["roll_y"]] <- zoo::rollmean(d[["y"]], perc) + + # from https://stats.stackexchange.com/questions/22974/how-to-find-local-peaks-valleys-in-a-series-of-data + find_maxima <- function (x, m = 1){ + shape <- diff(sign(diff(as.numeric(x), na.pad = FALSE))) + pks <- sapply(which(shape < 0), FUN = function(i){ + z <- i - m + 1 + z <- ifelse(z > 0, z, 1) + w <- i + m + 1 + w <- ifelse(w < length(x), w, length(x)) + if(all(x[c(z : i, (i + 2) : w)] <= x[i + 1])) return(i + 1) else return(numeric(0)) + }) + pks <- unlist(pks) + pks + } + + local_maxes <- find_maxima(d[["roll_y"]]) + fit_max <- 
max(local_maxes) + as.integer(perc/2) + + #mu <- d[["x"]][which.max(d[["y"]])] + #max_y <- max(d[["y"]]) + + # Set the maximum point in the density as the mean for the fitted Gaussian + mu <- d[["x"]][fit_max] # get max with respect to x) local maxima of rolling + max_y <- d[["y"]][fit_max] + cnt <- 0 + + while ( (max_y < 0.5*max(d[["y"]])) && (cnt < 5) ) { # while selected local max y is less than 50% of actual maximum + cnt <- cnt + 1 + perc <- as.integer((0.05-(cnt*0.01))*length(d[["y"]]) + 1) # rm 1 percent from roll avg interval per iteration + + #d[["roll_y"]] <- filter(data.frame(d[["y"]]), f_2perc, sides=2) + d[["roll_y"]] <- zoo::rollmean(d[["y"]], perc) + local_maxes[local_maxes < max(local_maxes)] + fit_max <- max(local_maxes) + as.integer(perc/2) + # Set the maximum point in the density as the mean for the fitted Gaussian + #mu <- d[["x"]][which.max(d[["y"]])] + mu <- d[["x"]][fit_max] # get max with respect to x) local maxima of rolling + max_y <- d[["y"]][fit_max] + } + + if ( (max_y < 0.1*max(d[["y"]])) ) { + mu <- d[["x"]][which.max(d[["y"]])] + max_y <- max(d[["y"]]) # if doesnt work use regular zFPKM calculation + } + + + + # standard deviation from right side + #U <- mean(log_abundance[log_abundance > mu]) + #stdev <- (U - mu) * sqrt(pi / 2) + # standard deviation from left side + U <- mean(log_abundance[log_abundance < mu & log_abundance > 0]) + stdev <- (mu - U) * sqrt(pi / 2) + + # Compute ztransform + z <- (log_abundance - mu) / stdev + + result <- z_result(z, d, mu, stdev, max_y) + + return(result) +} + + +plot_gaussian_fit <- function(results, FacetTitles=TRUE, PlotXfloor) { + + df <- data.frame() + + for (name in names(results)) { + result <- results[[name]] + d <- result[["d"]] + mu <- result[["m"]] + stdev <- result[["s"]] + max_y <- result[["max_y"]] + + fitted <- dnorm(d[["x"]], mean=mu, sd=stdev) + max_abundance <- max_y + max_fit <- max(fitted) + + scale_fit <- fitted * (max_abundance / max_fit) + + new_df <- data.frame(sample_name=name, log_abundance=d[["x"]], abundance_density=d[["y"]], + fitted_density_scaled=scale_fit) + + df <- df %>% dplyr::bind_rows(new_df) + } + + df_stack <- df %>% tidyr::gather(source, density, -c(log_abundance, sample_name)) + labels <- unique(df_stack$sample_name) + + maximum_x = max(df_stack[["log_abundance"]]) + #maximum_y = max(d[["y"]]) + + p <- ggplot2::ggplot(df_stack, ggplot2::aes(x=log_abundance, y=density, color=source)) + + #ggplot2::facet_wrap(~ sample_name) + + ggplot2::facet_wrap(vars(sample_name)) + + ggplot2::geom_line(alpha=0.7) + + ggplot2::theme_bw() + + ggplot2::labs(x="log2(abundance)", y="[scaled] density") + + ggplot2::theme(legend.position="top") + + ggplot2::xlim(PlotXfloor, maximum_x) + + print(p) +} + + +z_transform <- function(abundance_df, min_thresh) { + + abundance_df <- rm_infinite(abundance_df) + z_df <- data.frame(row.names=row.names(abundance_df)) + outputs <- list() + for (c in colnames(abundance_df)) { + output <- z_score_calc(abundance_df[, c], min_thresh) + z_df[, c] <- output[["z"]] + outputs[[c]] <- output + } + + return(list(outputs, z_df)) +} + + +z_score_plot <- function(abundance_df, min_thresh, FacetTitles=FALSE, PlotXfloor=-20) { + plot_gaussian_fit(z_transform(abundance_df, min_thresh)[[1]], FacetTitles, PlotXfloor) +} + + +rm_infinite <- function(abundance) { + # Remove FPKM rows containing all NaN values. These are most likely a result + # of effective lengths = 0 when calculating FPKM. 
+ abundance <- as.data.frame(abundance) + return(abundance[which(!apply(abundance, 1, function(r) all(is.nan(r) | is.infinite(r)))), ]) +} + + +protein_transform_main <- function(abundance_matrix, out_dir, group_name) { + dir.create(file.path(out_dir, "figures"), showWarnings = FALSE) + prot <- as.data.frame(readr::read_csv(abundance_matrix)) + prot[is.na(prot)] <- 0 + min_thresh <- min(prot>0) + #row.names(prot) <- prot$ENTREZ_GENE_ID + #prot["ENTREZ_GENE_ID"] <- NULL + pdf(file.path(out_dir, "figures", paste0("fit_proteinAbundance_", group_name, ".pdf"))) + z_score_plot(prot[,-1], 0, out_dir) + dev.off() + #if ( i ==1 ) {stop()} + minimums <- prot == 0 + nas <- is.na(prot)== 1 + z_transformed_abundances <- cbind(prot[,1], z_transform(prot[,c(-1)], 0)[[2]]) + z_transformed_abundances[minimums] <- -4 + #z_transformed_abundances[nas] <- -4 + colnames(z_transformed_abundances)[1] <- "ENTREZ_GENE_ID" + out_file <- file.path(out_dir, paste0("protein_zscore_Matrix_", group_name, ".csv")) + write.csv(z_transformed_abundances, out_file, row.names=FALSE) +} diff --git a/main/como/rscripts/rnaseq.R b/main/como/rscripts/rnaseq.R new file mode 100644 index 00000000..23b0a37b --- /dev/null +++ b/main/como/rscripts/rnaseq.R @@ -0,0 +1,538 @@ +suppressPackageStartupMessages(library("biomaRt")) +suppressPackageStartupMessages(library("dplyr")) +suppressPackageStartupMessages(library("edgeR")) +suppressPackageStartupMessages(library("genefilter")) +suppressPackageStartupMessages(library("limma")) +suppressPackageStartupMessages(library("readr")) +suppressPackageStartupMessages(library("readxl")) +suppressPackageStartupMessages(library("sjmisc")) +suppressPackageStartupMessages(library("stringr")) +suppressPackageStartupMessages(library("tidyverse")) +suppressPackageStartupMessages(library("zFPKM")) + +# Check if rlogs directory exists, From: https://stackoverflow.com/a/46008094 +# Then prevent messy messages from repeatedly writing to juypter +work_dir <- getwd() +r_log_directory <- stringr::str_interp("${work_dir}/logs") +if (!dir.exists(r_log_directory)) { dir.create(r_log_directory) } +zz <- file(file.path(r_log_directory, "rnaseq.Rout"), open = "wt") +sink(zz, type = "message") + +# Create a variable x set equal to 1 + + +read_counts_matrix <- function(counts_matrix_filepath, config_filepath, info_filepath, context_name) { + config_object <- readxl::read_excel(config_filepath, sheet = context_name) + counts_matrix <- data.frame(readr::read_csv(file = counts_matrix_filepath)) %>% dplyr::arrange(., ensembl_gene_id) + gene_info <- data.frame(readr::read_csv(info_filepath)) %>% + dplyr::mutate(size = end_position - start_position) %>% # Calculate gene size + dplyr::arrange(.$ensembl_gene_id) %>% + dplyr::filter(.$entrez_gene_id != "-") %>% # Remove un-named genes + dplyr::filter(.$ensembl_gene_id %in% counts_matrix$ensembl_gene_id) # Only keep genes that are in counts_matrix + + # Select rows where "genes" in counts_matrix are also present in gene_info + counts_matrix <- counts_matrix[(counts_matrix$ensembl_gene_id %in% gene_info$ensembl_gene_id),] + entrez_genes <- gene_info$entrez_gene_id + + # remove version numbers from ensembl id + for (i in seq_along(entrez_genes)) { + row <- entrez_genes[i] + if (grepl("\\\\.", row)) { + gen <- unlist(stringr::str_split(row, "\\\\."))[1] + entrez_genes[i] <- gen + } + } + + + # initialize groups and pre-allocate space + sample_metrics <- list() + num_samples <- length(config_object$sample_name) + groups <- unique(config_object$study) + for (group in groups) { + 
sample_metrics[[group]] <- list( + CountMatrix = matrix(data = NA, nrow = nrow(counts_matrix), ncol = num_samples), + FragmentLengths = numeric(length = num_samples), + SampleNames = character(length = num_samples), + Layout = character(length = num_samples) + ) + } + + # add to group count matrices and insert lists + for (i in seq_along(config_object$sample_name)) { + entry <- config_object$sample_name[i] # CELL-TYPE_S##R## + group <- config_object$study[i] + + # Test if `entry` is not in counts_matrix columns + if (!(entry %in% colnames(counts_matrix))) { + print(paste(entry, "not found in count matrix.")) + next + } + + # In case insert_size is NULL, set it to 0 + fragment_length_value <- ifelse(is.na(config_object$fragment_length[i]), 0, config_object$fragment_length[i]) + + # update sample_metrics with current replicate values + col_index <- which(colnames(counts_matrix) == entry) # Use column index for faster access + sample_metrics[[group]][["CountMatrix"]][, i] <- as.numeric(counts_matrix[, col_index]) + sample_metrics[[group]][["FragmentLengths"]][i] <- fragment_length_value + sample_metrics[[group]][["SampleNames"]][i] <- entry + sample_metrics[[group]][["Layout"]][i] <- config_object$layout[i] + } + + for (group in groups) { # for each study/batch group + samp_mat <- sample_metrics[[group]][["CountMatrix"]] + samp_mat <- apply(samp_mat, FUN = as.numeric, MARGIN = 2, simplify = FALSE) # '2' means apply over columns + write.csv(samp_mat, file = "/home/joshl/projects/COMO/main/samp_mat.csv") + samp_mat <- t(samp_mat) + + colnames(samp_mat) <- sample_metrics[[group]][["SampleNames"]] # set column names to sample names + sample_metrics[[group]][["CountMatrix"]] <- samp_mat # update counts matrix + sample_metrics[[group]][["NumSamples"]] <- ncol(samp_mat) # set number of samples + sample_metrics[[group]][["Entrez"]] <- as.character(gene_info$entrez_gene_id) # store entrez ids + sample_metrics[[group]][["GeneSizes"]] <- gene_info$size # store gene size + sample_metrics[[group]][["StudyNumber"]] <- group + } + return(sample_metrics) +} + + +calculate_tpm <- function(sample_metrics) { + + for (i in seq_along(sample_metrics)) { + count_matrix <- sample_metrics[[i]][["CountMatrix"]] + gene_size <- sample_metrics[[i]][["GeneSizes"]] + tpm_matrix <- do.call(cbind, lapply(seq_len(ncol(count_matrix)), function(j) { + rate <- log(count_matrix[, j]) - log(gene_size[j]) + denom <- log(sum(exp(rate))) + exp(rate - denom + log(1e6)) + })) + colnames(tpm_matrix) <- colnames(count_matrix) + sample_metrics[[i]][["TPM_Matrix"]] <- tpm_matrix + } + + return(sample_metrics) +} + + +calculate_fpkm <- function(sample_metrics) { + + for (i in seq_along(sample_metrics)) { + + layout <- sample_metrics[[i]][["Layout"]] # get layout + + if (layout[1] == "paired-end") { # fpkm + # Print components of sample_metrics[[i]], which is of type list + count_matrix <- sample_metrics[[i]][["CountMatrix"]] + gene_size <- sample_metrics[[i]][["GeneSizes"]] + mean_fragment_lengths <- sample_metrics[[i]][["FragmentLengths"]] + + fpkm_matrix <- matrix(nrow = nrow(count_matrix), ncol = ncol(count_matrix)) + print(class(count_matrix)) + print(head(colSums(count_matrix))) + for (j in seq_len(ncol(count_matrix))) { + eff_len <- gene_size - mean_fragment_lengths[j] + 1 # plus one to prevent div by 0 + + print(paste("j:", j)) + print(paste("ncol:", ncol(count_matrix))) + N <- sum(count_matrix[, j]) + + fpkm_matrix[, j] <- exp(log(count_matrix[, j]) + log(1e9) - log(eff_len) - log(N)) + } + fpkm_matrix[is.nan(fpkm_matrix)] <- 0 + 
colnames(fpkm_matrix) <- colnames(count_matrix) + sample_metrics[[i]][["FPKM_Matrix"]] <- fpkm_matrix + + } else if (layout[1] == "single-end" ) { # rpkm + count_matrix <- sample_metrics[[i]][["CountMatrix"]] + gene_size <- sample_metrics[[i]][["GeneSizes"]] + rpkm_matrix <- do.call(cbind, lapply(seq_len(ncol(count_matrix)), function(j) { + rate <- log(count_matrix[, j]) - log(gene_size[j]) + exp(rate - log(sum(count_matrix[, j])) + log(1e9)) + })) + rpkm_matrix[is.nan(rpkm_matrix)] <- 0 + colnames(rpkm_matrix) <- colnames(count_matrix) + sample_metrics[[i]][["FPKM_Matrix"]] <- rpkm_matrix + } else { + stop(paste0("Invalid layout for sample '", sample_metrics[[i]][["StudyNumber"]], "'. Must be paired-end or single-end")) + } + } + return(sample_metrics) +} + +calculate_z_score <- function(sample_metrics, norm_tech) { + + for (i in seq_along(sample_metrics)) { + if (norm_tech == "CPM") { + tmat <- sample_metrics[[i]][["CPM_Matrix"]] + } else if (norm_tech == "TPM") { + tmat <- sample_metrics[[i]][["TPM_Matrix"]] + } + zmat <- matrix(nrow = nrow(tmat), ncol = ncol(tmat)) + + for (j in seq_len(ncol(tmat))) { + tvec <- tmat[, j] + logvec <- log2(tvec) + logvec[is.infinite(logvec)] <- NA + zvec <- scale(logvec, center = TRUE, scale = TRUE) + zmat[, j] <- zvec + } + zmat <- data.frame(zmat) + colnames(zmat) <- colnames(tmat) + sample_metrics[[i]][["Zscore"]] <- zmat + } + + return(sample_metrics) +} + + +cpm_filter <- function(sample_metrics, filt_options, context_name, prep) { + + N_exp <- filt_options$replicate_ratio + N_top <- filt_options$replicate_ratio_high + min.count <- filt_options$min_count + for (i in seq_along(sample_metrics)) { + + study_number <- sample_metrics[[i]][["StudyNumber"]] + counts <- sample_metrics[[i]][["CountMatrix"]] + ent <- sample_metrics[[i]][["Entrez"]] + size <- sample_metrics[[i]][["GeneSizes"]] + lib.size <- colSums(counts) + CPM <- edgeR::cpm(counts, lib.size = lib.size) + cpm_fname <- file.path(work_dir, "data", "results", context_name, prep, paste0("CPM_Matrix_", prep, "_", study_number, ".csv")) + write_cpm <- cbind(ent, CPM) + write.csv(write_cpm, cpm_fname, row.names = FALSE) + + min.samples <- round(N_exp * ncol(counts)) + top.samples <- round(N_top * ncol(counts)) + test_bools <- data.frame(gene = ent) + + for (j in seq_len(ncol(CPM))) { + cutoff <- ifelse(min.count == "default", + 10e6 / (median(sum(counts[, j]))), + 1e6 * min.count / (median(sum(counts[, j])))) + test_bools <- cbind(test_bools, as.integer(CPM[, j] > cutoff)) + } + + test_bools["gene"] <- NULL + f1 <- genefilter::kOverA(min.samples, 0.9) + flist <- genefilter::filterfun(f1) + keep <- genefilter::genefilter(test_bools, flist) + sample_metrics[[i]][["Entrez"]] <- ent[keep] + sample_metrics[[i]][["GeneSizes"]] <- size[keep] + sample_metrics[[i]][["CountMatrix"]] <- counts[keep,] + sample_metrics[[i]][["CPM_Matrix"]] <- CPM[keep,] + + f1_top <- genefilter::kOverA(top.samples, 0.9) + flist_top <- genefilter::filterfun(f1_top) + keep_top <- genefilter::genefilter(test_bools, flist_top) + + sample_metrics[[i]][["Entrez_hc"]] <- ent[keep_top] + } + + sample_metrics <- calculate_z_score(sample_metrics, "CPM") + + return(sample_metrics) +} + + +TPM_quant_filter <- function(sample_metrics, filt_options, context_name, prep) { + + N_exp <- filt_options$replicate_ratio + N_top <- filt_options$replicate_ratio_high + quant <- filt_options$quantile + sample_metrics <- calculate_tpm(sample_metrics) + + for (i in seq_along(sample_metrics)) { + + study_number <- sample_metrics[[i]][["StudyNumber"]] + 
counts <- sample_metrics[[i]][["CountMatrix"]] + ent <- sample_metrics[[i]][["Entrez"]] + size <- sample_metrics[[i]][["GeneSizes"]] + tpm <- sample_metrics[[i]][["TPM_Matrix"]] + tpm_fname <- file.path(work_dir, "data", "results", context_name, prep, paste0("TPM_Matrix_", prep, "_", study_number, ".csv")) + write_tpm <- cbind(ent, tpm) + write.csv(write_tpm, tpm_fname, row.names = FALSE) + + min.samples <- round(N_exp * ncol(tpm)) + top.samples <- round(N_top * ncol(tpm)) + test_bools <- data.frame(gene = ent) + + for (j in seq_len(ncol(tpm))) { + tpm_q <- tpm[, j] + tpm_q <- tpm_q[tpm_q > 0] + q_cutoff <- quantile(tpm_q, prob = 1 - quant / 100) + #q_cutoff_top <- quantile(tpm_q, prob=1-perc_top/100) + #bools <- data.frame(as.integer(tpm[,j]>q_cutoff)) + #bools_top <- data.frame(as.integer(tpm[,j]>q_cutoff_top)) + test_bools <- cbind(test_bools, as.integer(tpm[, j] > q_cutoff)) + } + + test_bools["gene"] <- NULL + f1 <- genefilter::kOverA(min.samples, 0.9) + flist <- genefilter::filterfun(f1) + keep <- genefilter::genefilter(test_bools, flist) + sample_metrics[[i]][["Entrez"]] <- ent[keep] + sample_metrics[[i]][["GeneSizes"]] <- size[keep] + sample_metrics[[i]][["CountMatrix"]] <- counts[keep,] + sample_metrics[[i]][["TPM_Matrix"]] <- tpm[keep,] + + f1_top <- genefilter::kOverA(top.samples, 0.9) + flist_top <- genefilter::filterfun(f1_top) + keep_top <- genefilter::genefilter(test_bools, flist_top) + + sample_metrics[[i]][["Entrez_hc"]] <- ent[keep_top] + } + + sample_metrics <- calculate_z_score(sample_metrics, "TPM") + + return(sample_metrics) +} + + +zfpkm_filter <- function(sample_metrics, filt_options, context_name, prep) { + N_exp <- filt_options$replicate_ratio # ratio replicates for active + N_top <- filt_options$replicate_ratio_high # ratio of replicates for high-confidence + cutoff <- filt_options$min_zfpkm + + sample_metrics <- calculate_fpkm(sample_metrics) + for (i in seq_along(sample_metrics)) { + study_number <- sample_metrics[[i]][["StudyNumber"]] + + entrez_ids <- sample_metrics[[i]][["Entrez"]] # get entrez ids + fpkm_matrix <- sample_metrics[[i]][["FPKM_Matrix"]] # get fpkm matrix + fpkm_df <- data.frame(fpkm_matrix) # convert to df + fpkm_df[rowSums(fpkm_df[]) > 0,] + fpkm_filename <- file.path(work_dir, "data", "results", context_name, prep, paste0("FPKM_Matrix_", prep, "_", study_number, ".csv")) + write_fpkm <- cbind(entrez_ids, fpkm_df) + colnames(write_fpkm)[1] <- "ENTREZ_GENE_ID" + write.csv(write_fpkm, fpkm_filename, row.names = FALSE) + + minimums <- fpkm_df == 0 + na_values <- is.na(fpkm_df) == 1 + + # calculate zFPKM + zmat <- zFPKM::zFPKM(fpkm_df, min_thresh = 0, assayName = "FPKM") + zmat[minimums] <- -4 # instead of -inf set to lower limit + + zfpkm_fname <- file.path(work_dir, "data", "results", context_name, prep, paste0("zFPKM_Matrix_", prep, "_", study_number, ".csv")) + write_zfpkm <- dplyr::bind_cols(entrez_ids, zmat) + colnames(write_zfpkm)[1] <- "ENTREZ_GENE_ID" + write.csv(write_zfpkm, zfpkm_fname, row.names = FALSE) + + zfpkm_plot_dir <- file.path(work_dir, "data", "results", context_name, prep, "figures") + # zfpkm_plot_dir <- file.path("/home", username, "main", "data", "results", context_name, prep, "figures") + if (!file.exists(zfpkm_plot_dir)) { + dir.create(zfpkm_plot_dir) + } + + zfpkm_plotname <- file.path(zfpkm_plot_dir, paste0("zFPKM_plot_", study_number, ".pdf")) + pdf(zfpkm_plotname) + zFPKM::zFPKMPlot(fpkm_df, min_thresh = min(fpkm_df), assayName = "FPKM") + dev.off() + + min.samples <- round(N_exp * ncol(zmat)) # min number of 
samples for active + top.samples <- round(N_top * ncol(zmat)) # top number of samples for high-confidence + + # active genes + f1 <- genefilter::kOverA(min.samples, cutoff) + flist <- genefilter::filterfun(f1) + keep <- genefilter::genefilter(zmat, flist) + sample_metrics[[i]][["Entrez"]] <- entrez_ids[keep] + + # top percentile genes + f1_top <- genefilter::kOverA(top.samples, cutoff) + flist_top <- genefilter::filterfun(f1_top) + keep_top <- genefilter::genefilter(zmat, flist_top) + sample_metrics[[i]][["Entrez_hc"]] <- entrez_ids[keep_top] + } + + return(sample_metrics) +} + + +umi_filter <- function(sample_metrics, filt_options, context_name) { + prep <- "scrna" + N_exp <- filt_options$replicate_ratio # ratio replicates for active + N_top <- filt_options$replicate_ratio_high # ratio of replicates for high-confidence + cutoff <- filt_options$min_zfpkm + + + #sample_metrics <- calculate_fpkm(sample_metrics) + for (i in seq_along(sample_metrics)) { + study_number <- sample_metrics[[i]][["StudyNumber"]] + + entrez_ids <- sample_metrics[[i]][["Entrez"]] # get entrez ids + count_matrix <- data.frame(sample_metrics[[i]][["CountMatrix"]]) + # Convert count matrix to numeric + count_matrix <- sapply(count_matrix, as.numeric) + count_matrix[rowSums(count_matrix[]) > 0,] + minimums <- count_matrix == 0 + + zmat <- zFPKM::zFPKM(count_matrix, min_thresh = 0, assayName = "UMI") + zmat[minimums] <- -4 # instead of -inf set to lower limit + + write_zumi <- dplyr::bind_cols(entrez_ids, zmat) + # write_zumi <- cbind(ent, zmat) + colnames(write_zumi)[1] <- "ENTREZ_GENE_ID" + zumi_fname <- file.path(work_dir, "data", "results", context_name, prep, paste0("zUMI_Matrix_", prep, "_", study_number, ".csv")) + write.csv(write_zumi, zumi_fname, row.names = FALSE) + + zumi_plot_dir <- file.path(work_dir, "data", "results", context_name, prep, "figures") + # zumi_plot_dir <- file.path("/home", username, "main", "data", "results", context_name, prep, "figures") + + if (!file.exists(zumi_plot_dir)) { + dir.create(zumi_plot_dir) + } + + batch_size <- 12 + plot_batches <- ceiling(ncol(count_matrix) / batch_size) + + if (plot_batches < 2) { + zumi_plotname <- file.path(zumi_plot_dir, paste0("zumi_plot_", study_number, ".pdf")) + pdf(zumi_plotname) + zFPKM::zFPKMPlot(count_matrix, min_thresh = min(count_matrix), assayName = "UMI") + dev.off() + + } else { + + for (j in 1:(plot_batches - 1)) { + zumi_plotname <- file.path(zumi_plot_dir, paste0("zumi_plot_", study_number, "_", j, ".pdf")) + pdf(zumi_plotname) + samps <- c() + jmin <- (batch_size * (j - 1)) + 1 + jmax <- batch_size * j + samps <- jmin:jmax + + while (samps[length(samps)] > ncol(count_matrix)) { + samps <- samps[seq_along(samps) - 1] + } + + zFPKM::zFPKMPlot( + count_matrix[, samps], + min_thresh = 0, + assayName = "UMI" + ) + dev.off() + } + } + + min.samples <- round(N_exp * ncol(zmat)) # min number of samples for active + top.samples <- round(N_top * ncol(zmat)) # top number of samples for high-confidence + + # active genes + f1 <- genefilter::kOverA(min.samples, cutoff) + flist <- genefilter::filterfun(f1) + keep <- genefilter::genefilter(zmat, flist) + sample_metrics[[i]][["Entrez"]] <- entrez_ids[keep] + + # top percentile genes + f1_top <- genefilter::kOverA(top.samples, cutoff) + flist_top <- genefilter::filterfun(f1_top) + keep_top <- genefilter::genefilter(zmat, flist_top) + + sample_metrics[[i]][["Entrez_hc"]] <- entrez_ids[keep_top] + + } + + return(sample_metrics) +} + + +filter_counts <- function(sample_metrics, technique, filt_options, 
context_name, prep) { + switch( + technique, + cpm = cpm_filter(sample_metrics, filt_options, context_name, prep), + zfpkm = zfpkm_filter(sample_metrics, filt_options, context_name, prep), + quantile = TPM_quant_filter(sample_metrics, filt_options, context_name, prep), + umi = umi_filter(sample_metrics, filt_options, context_name) + ) +} + +save_rnaseq_tests <- function( + counts_matrix_file, + config_filepath, + out_file, + info_file, + context_name, + prep = "total", + replicate_ratio = 0.5, + batch_ratio = 0.5, + replicate_ratio_high = 0.9, + batch_ratio_high = 0.9, + technique = "quantile", + quantile = 0.9, + min_count = 10, + min_zfpkm = -3 +) { + + # condense filter options + filt_options <- list() + filt_options$replicate_ratio <- replicate_ratio + filt_options$batch_ratio <- batch_ratio + filt_options$quantile <- quantile + filt_options$min_count <- min_count + filt_options$min_zfpkm <- min_zfpkm + filt_options$replicate_ratio_high <- replicate_ratio_high + filt_options$batch_ratio_high <- batch_ratio_high + + if (prep == "scrna") { + technique <- "umi" + print("Note: Single cell filtration does not normalize and assumes counts are counted with UMI") + } + + print("1") + sample_metrics <- read_counts_matrix(counts_matrix_file, config_filepath, info_file, context_name) # read count matrix + print("2") + entrez_all <- sample_metrics[[1]][["Entrez"]] #get entrez ids + + sample_metrics <- filter_counts(sample_metrics, technique, filt_options, context_name, prep) # normalize and filter count + print("3") + expressedGenes <- c() + topGenes <- c() + for (i in seq_along(sample_metrics)) { # get high confidence and expressed genes for each study/batch number + expressedGenes <- c(expressedGenes, sample_metrics[[i]][["Entrez"]]) + topGenes <- c(topGenes, sample_metrics[[i]][["Entrez_hc"]]) + } + print("4") + + expMat <- as.data.frame(table(expressedGenes)) # convert expression to df + topMat <- as.data.frame(table(topGenes)) # convert high confidence to df + nc <- length(sample_metrics) # number of columns + expMat <- cbind(expMat, "Prop" = expMat$Freq / nc) # calculate proportion of studies/batches expressed + topMat <- cbind(topMat, "Prop" = topMat$Freq / nc) # calculate proportion of studies/batch high-confidence + sample_metrics[["ExpressionMatrix"]] <- expMat # store expression matrix for saving + sample_metrics[["TopMatrix"]] <- topMat # store high confidence matrix for saving + print("8") + # get genes which are expressed and high-confidence according to use defined ratio + sample_metrics[["ExpressedGenes"]] <- as.character(expMat$expressedGenes[expMat$Prop >= batch_ratio]) + sample_metrics[["TopGenes"]] <- as.character(topMat$topGenes[topMat$Prop >= batch_ratio_high]) + print("9") + + # create a table to write gene expression and high confidence to + write_table <- data.frame(entrez_all) + write_table <- cbind(write_table, rep(0, nrow(write_table))) + for (i in seq_len(nrow(write_table))) { + if (as.character(write_table[i, 1]) %in% as.character(sample_metrics$ExpressedGenes)) { + write_table[i, 2] <- 1 + } + if (as.character(write_table$entrez_all[i]) %in% as.character(sample_metrics$TopGenes)) { + write_table[i, 3] <- 1 + } + } + print("10") + header <- c("ENTREZ_GENE_ID", "expressed", "high") + #write_table <- rbind(header, write_table) + colnames(write_table) <- header + write.csv(write_table, out_file, row.names = FALSE, col.names = FALSE) + print("11") + print(head(write_table)) +} + +save_rnaseq_tests( + counts_matrix_file = 
"/home/joshl/projects/COMO/main/data/data_matrices/naiveB/gene_counts_matrix_total_naiveB.csv", + config_filepath = "/home/joshl/projects/COMO/main/data/config_sheets/trnaseq_data_inputs_auto.xlsx", + out_file = "/home/joshl/projects/COMO/rnaseq_r_out.csv", + info_file = "/home/joshl/projects/COMO/main/data/gene_info.csv", + context_name = "naiveB", + technique = "zfpkm" +) \ No newline at end of file diff --git a/main/como/rscripts/transform.Rmd b/main/como/rscripts/transform.Rmd new file mode 100644 index 00000000..f8759078 --- /dev/null +++ b/main/como/rscripts/transform.Rmd @@ -0,0 +1,151 @@ +--- +title: "R Notebook" +output: html_notebook +--- + +```{r} +library(ggplot2) +library(tidyverse) +``` + + +```{r} +z_result <- function(z_vector, density, mu, stdev, max_y) { + z_res <- list( + z = z_vector, + d = density, + m = mu, + s = stdev, + max_y = max_y + ) + + class(z_vector) <- append(class(z_res), "zFPKM") + return(z_res) +} +``` + +```{r} +z_score_calc <- function(abundance) { + if (!is.numeric(abundance)) { + stop("argument 'fpkm' must be numeric") + } + + log_abundance_filt <- log(abundance[abundance>0], base=2) + log_abundance <- log(abundance, base=2) + + d <- density(log_abundance_filt) + mu <- d[["x"]][which.max(d[["y"]])] + max_y <- max(d[["y"]]) + + # standard deviation from right side + #U <- mean(log_abundance[log_abundance > mu]) + #stdev <- (U - mu) * sqrt(pi / 2) + # standard deviation from left side + U <- mean(log_abundance[log_abundance < mu & log_abundance > 0]) + stdev <- (mu - U) * sqrt(pi / 2) + + + # Compute ztransform + z <- (log_abundance - mu) / stdev + + + result <- z_result(z, d, mu, stdev, max_y) + + return(result) +} + +``` + +```{r} +plot_gaussian_fit <- function(results, FacetTitles=TRUE, PlotXfloor) { + + df <- data.frame() + + for (name in names(results)) { + result <- results[[name]] + d <- result[["d"]] + mu <- result[["m"]] + stdev <- result[["s"]] + max_y <- result[["max_y"]] + + fitted <- dnorm(d[["x"]], mean=mu, sd=stdev) + max_abundance <- max_y + max_fit <- max(fitted) + + scale_fit <- fitted * (max_abundance / max_fit) + + new_df <- data.frame(sample_name=name, log_abundance=d[["x"]], abundance_density=d[["y"]], + fitted_density_scaled=scale_fit) + + + df <- df %>% dplyr::bind_rows(new_df) + } + + df_stack <- df %>% tidyr::gather(source, density, -c(log_abundance, sample_name)) + labels <- unique(df_stack$sample_name) + + maximum_x = max(df_stack[["log_abundance"]]) + #maximum_y = max(d[["y"]]) + + p <- ggplot2::ggplot(df_stack, ggplot2::aes(x=log_abundance, y=density, color=source)) + + #ggplot2::facet_wrap(~ sample_name) + + ggplot2::facet_wrap(vars(sample_name)) + + ggplot2::geom_line(alpha=0.7) + + ggplot2::theme_bw() + + ggplot2::labs(x="log2(abundance)", y="[scaled] density") + + ggplot2::theme(legend.position="top") + + ggplot2::xlim(PlotXfloor, maximum_x) + + + print(p) +} +``` + +```{r} +z_transform <- function(abundance_df) { + + abundance_df <- rm_infinite(abundance_df) + + z_df <- data.frame(row.names=row.names(abundance_df)) + outputs <- list() + for (c in colnames(abundance_df)) { + output <- z_score_calc(abundance_df[, c]) + z_df[, c] <- output[["z"]] + outputs[[c]] <- output + } + + return(list(outputs, z_df)) +} +``` + +```{r} +z_score_plot <- function(abundance_df, FacetTitles=FALSE, PlotXfloor=-20) { + plot_gaussian_fit(z_transform(abundance_df)[[1]], FacetTitles, PlotXfloor) +} + +``` + +```{r} +rm_infinite <- function(fpkm) { + # Remove FPKM rows containing all NaN values. 
These are most likely a result + # of effective lengths = 0 when calculating FPKM. + return(fpkm[which(!apply(fpkm, 1, function(r) all(is.nan(r) | is.infinite(r)))), ]) +} +``` + +```{r} +setwd("/Users/joshl/Downloads/NormTransformTest") +dir.create("figures", showWarnings = FALSE) +prot <- read.csv("/Users/joshl/PycharmProjects/MADRID/pipelines/data/data_matrices/Naive/ProteomicsDataMatrix_Naive.csv") + +plot_batches <- ceiling(ncol(prot)-2)/6 + +for ( i in 1:plot_batches) { + png(file.path("figures", paste0("fit_prot_", i, ".png"))) + z_score_plot(prot[,c((i-1)*6+3, (i-1)*6+4, (i-1)*6+5, (i-1)*6+6, (i-1)*6+7, (i-1)*6+8)]) + dev.off() + #if ( i ==1 ) {stop()} +} +z_transformed_abundances <- z_transform(prot[,c(-1, -2)])[[2]] + +``` diff --git a/main/como/stats/__init__.py b/main/como/stats/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/main/como/stats/_two_sample.py b/main/como/stats/_two_sample.py new file mode 100644 index 00000000..d856ebeb --- /dev/null +++ b/main/como/stats/_two_sample.py @@ -0,0 +1,60 @@ +from abc import ABC, abstractmethod +from collections.abc import Mapping +from concurrent.futures import ProcessPoolExecutor, as_completed +from typing import ClassVar, Generic, Literal, TypeVar + +import numpy as np +import numpy.typing as npt +import pandas as pd + +T_BASE_SAMPLE = TypeVar("T_BASE_SAMPLE", bound="BaseTwoSample") +T_ALTERNATIVE = Literal["greater", "less", "two-sided"] +KS_RESULT = tuple[np.floating, np.floating, np.floating, np.int8] +MW_RESULT = tuple[np.floating, np.floating] +TEST_RESULT = TypeVar("TEST_RESULT", KS_RESULT, MW_RESULT) + +__all__ = ["BaseTwoSample"] + + +class BaseTwoSample(ABC, Generic[TEST_RESULT]): + _fields: ClassVar[dict[str, type]] + + @staticmethod + @abstractmethod + def _worker(a: npt.NDArray[np.floating], b: npt.NDArray[np.floating], **kwargs) -> TEST_RESULT: ... + + @property + @abstractmethod + def df(self) -> pd.DataFrame: + """DataFrame representation of the results. + + Returns: + A DataFrame with columns corresponding to the fields in `_fields`. + """ + ... + + @classmethod + def _run( + cls: type[T_BASE_SAMPLE], + df1: pd.DataFrame, + df2: pd.DataFrame, + cores: int = 1, + worker_kwargs: dict | None = None, + ) -> tuple[list[str], Mapping[str, npt.NDArray[np.float64 | np.uint8]]]: + all_reactions = list(set(df1.columns) & set(df2.columns)) + array_a = df1[all_reactions].to_numpy(dtype=np.float64, copy=False) + array_b = df2[all_reactions].to_numpy(dtype=np.float64, copy=False) + n = len(all_reactions) + + results = {field: np.empty(n, dtype=np.dtype(dtype)) for field, dtype in cls._fields.items()} + + with ProcessPoolExecutor(max_workers=cores) as pool: + futures = {pool.submit(cls._worker, array_a[:, i], array_b[:, i], **(worker_kwargs or {})): i for i in range(n)} + for future in as_completed(futures): + col_idx: int = futures[future] + res: KS_RESULT | MW_RESULT = future.result() + + for (field, _), value in zip(cls._fields.items(), res, strict=True): + results[field][col_idx] = value + + return all_reactions, results diff --git a/main/como/stats/fisher_exact_test.py b/main/como/stats/fisher_exact_test.py new file mode 100644 index 00000000..068eb4c2 --- /dev/null +++ b/main/como/stats/fisher_exact_test.py @@ -0,0 +1,101 @@ +from dataclasses import dataclass +from typing import Annotated, Literal + +import cobra +import numpy as np +import scipy + + +@dataclass(frozen=True, kw_only=True, slots=True) +class FisherExactTest: + """Evalute Fisher's Exact Test for reaction presence. 
+
+    Fisher's Exact Test is a non-parametric statistical test used to determine if there are nonrandom associations between two variables.
+    It is useful in metabolic modeling because it can help assess whether the presence or absence of certain reactions in a metabolic model
+    is independent of a specific condition or treatment without assuming the distribution of the data.
+
+    To calculate the Fisher's Exact Test, execute :func:`FisherExactTest.run`, which will return an instance of :class:`FisherExactTest`.
+
+    References:
+        [SciPy](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.fisher_exact.html)
+        [Wikipedia](https://en.wikipedia.org/wiki/Fisher%27s_exact_test)
+    """
+
+    pathway: Annotated[str, "The pathway tested"]
+    statistic: Annotated[float, "The odds ratio of the test"]
+    pvalue: Annotated[float, "The p-value of the test"]
+    a: Annotated[int, "Number of reactions in the pathway AND scenario model"]
+    b: Annotated[int, "Number of reactions in the pathway but NOT the scenario model"]
+    c: Annotated[int, "Number of reactions NOT in the pathway but ARE in the scenario model"]
+    d: Annotated[int, "Number of reactions NOT in the pathway NOR the scenario model"]
+
+    @classmethod
+    def run(
+        cls: type["FisherExactTest"],
+        reference: cobra.Model,
+        scenario: cobra.Model,
+        pathway: str,
+        alternative: Literal["two-sided", "less", "greater"] = "two-sided",
+    ) -> "FisherExactTest":
+        """Perform a Fisher's Exact Test on two models with a known reference model.
+
+        This test is based on the following assumptions:
+        - The general "reference" model (such as Recon3D) was used to reconstruct both the comprehensive and the scenario-specific models
+        - A scenario-specific model exists that may not be representative of true biology
+
+        ---
+
+        Given the following contingency table for a set of conditions and N reactions:
+        - A: Reactions in `pathway` and the scenario-specific model
+        - B: Reactions in `pathway` but not the scenario-specific model
+        - C: Reactions in the scenario-specific model that are not a part of `pathway`
+        - D: Reactions not in `pathway` that are also not found in the scenario-specific model
+
+        | Reaction Status  | In scenario-specific model | Not in scenario-specific model | Row Total                    |
+        |:----------------:|:--------------------------:|:------------------------------:|:----------------------------:|
+        | In `pathway`     | A                          | B                              | A + B                        |
+        | Not in `pathway` | C                          | D                              | C + D                        |
+        | Column Total     | A + C                      | B + D                          | A + B + C + D (=N reactions) |
+
+        A two-sided Fisher's exact test will ask the question:
+        > Is the inclusion or exclusion of this reaction in the scenario-specific model independent of its status in the reference model?
+
+        If the scenario-specific dataset is "small", the reconstruction will likely have excluded/dropped some reactions
+        as a result of the limited data available. This means the Fisher's exact test may show **many apparent differences**.
+        However, this could be noise from undersampling and not indicative of the true underlying biology.
+        In practice, if only a few reactions fall into cell A (above), this suggests that the scenario-specific model is too sparse
+        and not reconstructed with enough data.
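+
+        Example (illustrative sketch only; ``recon3d`` and ``tissue_model`` are hypothetical
+        ``cobra.Model`` objects loaded elsewhere, and the pathway name is a placeholder):
+
+            >>> result = FisherExactTest.run(reference=recon3d, scenario=tissue_model, pathway="Glycolysis/gluconeogenesis")
+            >>> assert result.a + result.b + result.c + result.d == len(recon3d.reactions)  # every reference reaction falls in one cell
+            >>> enriched = result.pvalue < 0.05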
+ + Args: + reference: The general reference model that was used to build the model (e.g., Recon3D) + scenario: The scenario-specific model to test (e.g., built using a small cohort of single-cell RNA-seq data) + pathway: The pathway to investigate for a Fisher's Exact Test + alternative: The alternative hypothesis to test + + Returns: + The p-value indicating whether the reaction presence in the scenario model is independent of the reference model. + """ + scenario_rxn_ids: set[str] = {rxn.id for rxn in scenario.reactions} + + a = 0 # a reaction is in the given pathway and scenario model + b = 0 # a reaction is in the given pathway but not the scenario model + c = 0 # a reaction is not in the given pathway but is in the scenario model + d = 0 # a reaction is not in the given pathway OR the scenario model + + for rxn in reference.reactions: + in_pathway = rxn.subsystem == pathway + in_scenario = rxn.id in scenario_rxn_ids + + if in_pathway: + if in_scenario: + a += 1 + else: + b += 1 + else: + if in_scenario: + c += 1 + else: + d += 1 + + result = scipy.stats.fisher_exact(np.array([[a, b], [c, d]]), alternative=alternative) + return cls(statistic=result.statistic, pvalue=result.pvalue, pathway=pathway, a=a, b=b, c=c, d=d) diff --git a/main/como/stats/ks_test.py b/main/como/stats/ks_test.py new file mode 100644 index 00000000..9f3c1778 --- /dev/null +++ b/main/como/stats/ks_test.py @@ -0,0 +1,102 @@ +from dataclasses import dataclass +from typing import ClassVar, Literal + +import numpy as np +import numpy.typing as npt +import pandas as pd +from scipy.stats import ks_2samp + +from como.stats._two_sample import KS_RESULT, T_ALTERNATIVE, BaseTwoSample + +__all__ = ["KSTest"] + + +@dataclass(frozen=True, kw_only=True, slots=True) +class KSTest(BaseTwoSample[KS_RESULT]): + _fields: ClassVar[dict[str, type]] = { + "statistic": np.float64, + "pvalue": np.float64, + "statistic_location": np.float64, + "statistic_sign": np.uint8, + } + + reaction_ids: list[str] + statistic: npt.NDArray[np.float64] + pvalue: npt.NDArray[np.float64] + statistic_location: npt.NDArray[np.float64] + statistic_sign: npt.NDArray[np.int8] + + @staticmethod + def _worker(a: npt.NDArray[np.floating], b: npt.NDArray[np.floating], **kwargs) -> KS_RESULT: + """Calculate the KS statistic. + + Args: + a: First array + b: Second array + kwargs: Additional keyword arguments to pass to `ks_2samp` + + Returns: + A tuple of (statistic, pvalue, statistic_location, statistic_sign) + """ + res = ks_2samp(a, b, **kwargs) + return res.statistic, res.pvalue, res.statistic_location, res.statistic_sign + + @classmethod + def run( + cls, + df1: pd.DataFrame, + df2: pd.DataFrame, + alternative: T_ALTERNATIVE = "two-sided", + method: Literal["auto", "exact", "asymp"] = "auto", + axis: int = 0, + nan_policy: Literal["raise", "propagate", "omit"] = "propagate", + keepdims: bool = False, + cores: int = 1, + ) -> "KSTest": + """Run the KS test on two dataframes. + + Args: + df1: The first dataframe to process; obtained from running `cobra.sampling.sample`. + Columns should be reaction IDs and rows should be samples. + df2: The second dataframe to process; obtained from running `cobra.sampling.sample`. + Columns should be reaction IDs and rows should be samples. + alternative: The alternative hypothesis to test. + method: The method to use for calculating the p-value. + axis: The axis to perform the test along. + nan_policy: The policy to use for handling NaNs. + keepdims: Whether to keep the dimensions of the input arrays. 
+ cores: The number of CPU cores to use for multiprocessing. + + Returns: + An instance of `KSTest` containing the results of the test. + """ + all_reactions, results = cls._run( + df1=df1, + df2=df2, + cores=cores, + worker_kwargs={"alternative": alternative, "method": method, "axis": axis, "nan_policy": nan_policy, "keepdims": keepdims}, + ) + return cls( + reaction_ids=all_reactions, + statistic=results["statistic"].astype(np.float64), + pvalue=results["pvalue"].astype(np.float64), + statistic_location=results["statistic_location"].astype(np.float64), + statistic_sign=results["statistic_sign"].astype(np.int8), + ) + + @property + def df(self) -> pd.DataFrame: + """DataFrame representation of the results. + + Returns: + A DataFrame with columns "statistic", "pvalue", "statistic_location", and "statistic_sign". + """ + return pd.DataFrame( + { + "statistic": self.statistic, + "pvalue": self.pvalue, + "statistic_location": self.statistic_location, + "statistic_sign": self.statistic_sign, + }, + index=pd.Index(name="reaction_id", data=self.reaction_ids), + ) diff --git a/main/como/stats/mann_whitney_test.py b/main/como/stats/mann_whitney_test.py new file mode 100644 index 00000000..98c8131d --- /dev/null +++ b/main/como/stats/mann_whitney_test.py @@ -0,0 +1,82 @@ +from dataclasses import dataclass +from typing import ClassVar, Literal + +import numpy as np +import numpy.typing as npt +import pandas as pd +from scipy.stats import PermutationMethod, mannwhitneyu + +from como.stats._two_sample import MW_RESULT, T_ALTERNATIVE, BaseTwoSample + +__all__ = ["MannWhitneyUTest"] + + +@dataclass(frozen=True, kw_only=True, slots=True) +class MannWhitneyUTest(BaseTwoSample[MW_RESULT]): + _fields: ClassVar[dict[str, type]] = {"statistic": np.float64, "pvalue": np.float64} + + reaction_ids: list[str] + statistic: npt.NDArray[np.float64] + pvalue: npt.NDArray[np.float64] + + @staticmethod + def _worker(a: npt.NDArray[np.floating], b: npt.NDArray[np.floating], **kwargs) -> MW_RESULT: + """Calculate the MWU statistic. + + Args: + a: First array + b: Second array + kwargs: Additional keyword arguments to pass to `mannwhitneyu` + + Returns: + A tuple of (statistic, pvalue) + """ + res = mannwhitneyu(x=a, y=b, **kwargs) + return np.float64(res.statistic), np.float64(res.pvalue) + + @classmethod + def run( + cls, + df1: pd.DataFrame, + df2: pd.DataFrame, + alternative: T_ALTERNATIVE = "two-sided", + use_continuity: bool = True, + axis: int = 0, + method: Literal["auto", "asymptotic", "exact"] | PermutationMethod = "auto", + cores: int = 1, + ) -> "MannWhitneyUTest": + """Run the MWU test on two dataframes. + + Args: + df1: The first dataframe to process; obtained from running `cobra.sampling.sample`. + Columns should be reaction IDs and rows should be samples. + df2: The second dataframe to process; obtained from running `cobra.sampling.sample`. + Columns should be reaction IDs and rows should be samples. + alternative: The alternative hypothesis to test. + use_continuity: Whether to apply a continuity correction when using the asymptotic method. + axis: The axis to perform the test along. + method: The method to use for calculating the p-value. + cores: The number of CPU cores to use for multiprocessing. + + Returns: + An instance of `MannWhitneyUTest` containing the results of the test. 
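+
+        Example (illustrative sketch only; ``flux_a`` and ``flux_b`` are hypothetical flux-sampling
+        DataFrames, e.g. produced by ``cobra.sampling.sample``, whose columns are reaction IDs):
+
+            >>> mwu = MannWhitneyUTest.run(flux_a, flux_b, alternative="two-sided", cores=4)
+            >>> significant = mwu.df[mwu.df["pvalue"] < 0.05]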
+ """ + all_reactions, results = cls._run( + df1=df1, + df2=df2, + cores=cores, + worker_kwargs={"alternative": alternative, "use_continuity": use_continuity, "axis": axis, "method": method}, + ) + return cls(reaction_ids=all_reactions, statistic=results["statistic"].astype(np.float64), pvalue=results["pvalue"].astype(np.float64)) + + @property + def df(self) -> pd.DataFrame: + """DataFrame representation of the results. + + Returns: + A DataFrame with columns "statistic" and "pvalue". + """ + return pd.DataFrame( + {"statistic": self.statistic, "pvalue": self.pvalue}, + index=pd.Index(name="reaction_id", data=self.reaction_ids), + ) diff --git a/main/como/utils.py b/main/como/utils.py new file mode 100644 index 00000000..a585dfb6 --- /dev/null +++ b/main/como/utils.py @@ -0,0 +1,280 @@ +from __future__ import annotations + +import asyncio +import contextlib +import io +import sys +import typing +from collections.abc import Iterator +from pathlib import Path +from typing import Literal, TextIO, overload + +import numpy.typing as npt +import pandas as pd +import scanpy as sc +from fast_bioservices import BioDBNet, Output, Taxon +from fast_bioservices.pipeline import ( + determine_gene_type, + ensembl_to_gene_id_and_symbol, + gene_id_to_ensembl_and_gene_symbol, + gene_symbol_to_ensembl_and_gene_id, +) +from loguru import logger + +from como.data_types import LOG_FORMAT, Algorithm, LogLevel + +__all__ = ["split_gene_expression_data", "stringlist_to_list", "suppress_stdout"] + + +def stringlist_to_list(stringlist: str | list[str]) -> list[str]: + """Convert a string from the command line into a Python list. + + In doing so, we must deprecate the use of the current method + + If '[' and ']' are present in the first and last items of the list, + assume we are using the "old" method of providing context names + + Args: + stringlist: The "string list" gathered from the command line. Example input: "['mat', 'xml', 'json']" + + Returns: + A list of strings. Example output: ['mat', 'xml', 'json'] + """ + if isinstance(stringlist, list): + return stringlist + + if not (stringlist.startswith("[") and stringlist.endswith("]")): + return stringlist.split(" ") + + # Remove any brackets from the first and last items; replace quotation marks and commas with nothing + new_list: list[str] = stringlist.strip("[]").replace("'", "").replace(" ", "").split(",") + + # Show a warning if more than one item is present in the list (this means we are using the old method) + logger.critical("DeprecationWarning: Please use the new method of providing context names, i.e. --output-filetypes 'type1 type2 type3'.") + logger.critical( + "If you are using COMO, this can be done by setting the 'context_names' variable to a " + "simple string separated by spaces. Here are a few examples!" + ) + logger.critical("context_names = 'cellType1 cellType2 cellType3'") + logger.critical("output_filetypes = 'output1 output2 output3'") + logger.critical("\nYour current method of passing context names will be removed in the future. Update your variables above accordingly!\n\n") + + return new_list + + +def split_gene_expression_data( + expression_data: pd.DataFrame, + identifier_column: Literal["ensembl_gene_id", "entrez_gene_id"], + recon_algorithm: Algorithm | None = None, + *, + ensembl_as_index: bool = True, +): + """Split the gene expression data into single-gene and multiple-gene names. 
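+
+    Example (illustrative sketch only; the toy frame below is hypothetical):
+
+        >>> df = pd.DataFrame({"ensembl_gene_id": ["ENSG0001", "ENSG0002///ENSG0003"], "active": [1.2, -0.5]})
+        >>> scores = split_gene_expression_data(df, identifier_column="ensembl_gene_id")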
+
+    Args:
+        expression_data: The gene expression data to map
+        identifier_column: The column containing the gene identifiers, either 'ensembl_gene_id' or 'entrez_gene_id'
+        recon_algorithm: The recon algorithm used to generate the gene expression data
+        ensembl_as_index: Should the identifier column be set as the index of the returned DataFrame
+
+    Returns:
+        A pandas DataFrame with the split gene expression data
+    """
+    expression_data.columns = [c.lower() for c in expression_data.columns]
+    if recon_algorithm in {Algorithm.IMAT, Algorithm.TINIT}:
+        expression_data.rename(columns={"combine_z": "active"}, inplace=True)
+
+    expression_data = expression_data[[identifier_column, "active"]]
+    single_gene_names = expression_data[~expression_data[identifier_column].astype(str).str.contains("//")]
+    multiple_gene_names = expression_data[expression_data[identifier_column].astype(str).str.contains("//")]
+    split_gene_names = multiple_gene_names.assign(ensembl_gene_id=multiple_gene_names[identifier_column].astype(str).str.split("///")).explode(
+        identifier_column
+    )
+
+    gene_expressions = pd.concat([single_gene_names, split_gene_names], axis=0, ignore_index=True)
+    if ensembl_as_index:
+        gene_expressions.set_index(identifier_column, inplace=True)
+    return gene_expressions
+
+
+@contextlib.contextmanager
+def suppress_stdout() -> Iterator[None]:
+    """Suppress stdout output from the current context.
+
+    Yields:
+        None; stdout is redirected to an in-memory buffer while the context is active.
+    """
+    with io.StringIO() as buffer:
+        try:
+            sys.stdout = buffer
+            yield
+        finally:
+            sys.stdout = sys.__stdout__
+
+
+async def format_determination(
+    biodbnet: BioDBNet,
+    *,
+    requested_output: Output | list[Output],
+    input_values: list[str],
+    taxon: Taxon,
+) -> pd.DataFrame:
+    """Determine the data type of the given input values (e.g., Entrez Gene ID, Gene Symbol).
+
+    Args:
+        biodbnet: The BioDBNet to use for determination
+        requested_output: The data type to generate (of type `Output`)
+        input_values: The input values to determine
+        taxon: The Taxon ID
+
+    Returns:
+        A pandas DataFrame
+    """
+    requested_output = [requested_output] if isinstance(requested_output, Output) else requested_output
+    coercion = (await biodbnet.db_find(values=input_values, output_db=requested_output, taxon=taxon)).drop(columns=["Input Type"])
+    coercion.columns = pd.Index(["input_value", *[o.value.replace(" ", "_").lower() for o in requested_output]])
+    return coercion
+
+
+@overload
+async def read_file(path: Path | io.StringIO, *, h5ad_as_df: Literal[False] = False, **kwargs) -> sc.AnnData | pd.DataFrame: ...
+
+
+@overload
+async def read_file(path: Path | io.StringIO | pd.DataFrame | sc.AnnData, *, h5ad_as_df: Literal[True] = True, **kwargs) -> pd.DataFrame: ...
+
+
+@overload
+async def read_file(path: None, *, h5ad_as_df: Literal[True] = True, **kwargs) -> None: ...
+
+
+async def read_file(path, *, h5ad_as_df=True, **kwargs):
+    """Asynchronously read a filepath and return a pandas DataFrame.
+
+    If the provided path is None, None will also be returned.
+    None may be provided to this function so that `asyncio.gather` can safely be used on all sources
+    (trna, mrna, scrna, proteomics) without needing to check if the user has provided those sources
+
+    Args:
+        path: The path to read from
+        h5ad_as_df: If True and the file is an h5ad, return a DataFrame of the .X matrix instead of an AnnData object
+        kwargs: Additional arguments to pass to pandas.read_csv, pandas.read_excel, or scanpy.read_h5ad, depending on the filepath provided
+
+    Returns:
+        None, or a pandas DataFrame or AnnData
+    """
+    if isinstance(path, pd.DataFrame | sc.AnnData):
+        return path
+    if not path:
+        return None
+
+    if isinstance(path, Path) and not path.exists():
+        _log_and_raise_error(f"File {path} does not exist", error=FileNotFoundError, level=LogLevel.CRITICAL)
+
+    # StringIO is used if a CSV file is read using open() directly
+    if isinstance(path, io.StringIO):
+        return pd.read_csv(path, **kwargs)
+
+    match path.suffix:
+        case ".csv" | ".tsv" | ".txt" | ".tab":
+            if "sep" not in kwargs:
+                kwargs.setdefault("sep", "," if path.suffix == ".csv" else "\t")
+            return await asyncio.to_thread(pd.read_csv, path, **kwargs)
+        case ".xlsx" | ".xls":
+            return await asyncio.to_thread(pd.read_excel, path, **kwargs)
+        case ".h5ad":
+            adata: sc.AnnData = await asyncio.to_thread(sc.read_h5ad, path, **kwargs)
+            if h5ad_as_df:
+                df = adata.to_df().T
+                df.index.name = "gene_symbol"
+                df.reset_index(inplace=True, drop=False)
+                return df
+            return adata
+        case _:
+            _log_and_raise_error(
+                f"Unknown file extension '{path.suffix}'. Valid options are '.tsv', '.csv', '.xlsx', '.xls', or '.h5ad'",
+                error=ValueError,
+                level=LogLevel.CRITICAL,
+            )
+
+
+async def get_missing_gene_data(values: list[str] | pd.DataFrame, taxon_id: int | str | Taxon) -> pd.DataFrame:
+    if isinstance(values, list):
+        gene_type = await determine_gene_type(values)
+        if all(v == "gene_symbol" for v in gene_type.values()):
+            return await gene_symbol_to_ensembl_and_gene_id(values, taxon=taxon_id)
+        elif all(v == "ensembl_gene_id" for v in gene_type.values()):
+            return await ensembl_to_gene_id_and_symbol(ids=values, taxon=taxon_id)
+        elif all(v == "entrez_gene_id" for v in gene_type.values()):
+            return await gene_id_to_ensembl_and_gene_symbol(ids=values, taxon=taxon_id)
+        else:
+            logger.critical("Gene data must be of the same type (i.e., all Ensembl, Entrez, or Gene Symbols)")
+            raise ValueError("Gene data must be of the same type (i.e., all Ensembl, Entrez, or Gene Symbols)")
+    elif isinstance(values, pd.DataFrame):
+        if "gene_symbol" in values.columns:
+            return await get_missing_gene_data(values["gene_symbol"].tolist(), taxon_id=taxon_id)
+        elif "entrez_gene_id" in values.columns:
+            return await get_missing_gene_data(values["entrez_gene_id"].tolist(), taxon_id=taxon_id)
+        elif "ensembl_gene_id" in values.columns:
+            return await get_missing_gene_data(values["ensembl_gene_id"].tolist(), taxon_id=taxon_id)
+        else:
+            logger.critical("Unable to find 'gene_symbol', 'entrez_gene_id', or 'ensembl_gene_id' in the input matrix.")
+            raise ValueError("Unable to find 'gene_symbol', 'entrez_gene_id', or 'ensembl_gene_id' in the input matrix.")
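+
+
+# Illustrative usage sketch (comments only, not executed): how the async helpers above might be
+# combined. The file name, the taxon, and the presence of an 'ensembl_gene_id' column in the CSV
+# are assumptions for the sake of the example.
+#
+#   import asyncio
+#   from pathlib import Path
+#
+#   async def _example() -> None:
+#       counts = await read_file(Path("gene_counts.csv"))              # CSV -> pandas DataFrame
+#       mapping = await get_missing_gene_data(counts, taxon_id=9606)   # maps IDs via the 'ensembl_gene_id' column
+#       print(mapping.head())
+#
+#   asyncio.run(_example())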