Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
61 commits
Select commit Hold shift + click to select a range
cc06df8
add basic Android dynamic extractor framework
xukunzh May 29, 2025
5415459
Add Frida log to capa analysis workflow
xukunzh Jun 4, 2025
8ed3cd1
Implement basic Frida JSONL output and parser
xukunzh Jun 13, 2025
afe17ed
Merge pull request #2 from xukunzh/FridaExtractor
xukunzh Jun 13, 2025
3c3fce1
Implement basic Frida JSONL output and parser
xukunzh Jun 13, 2025
31fad02
Revert "Implement basic Frida JSONL output and parser"
xukunzh Jun 15, 2025
fda4892
Add FROMAT_ANDROID
xukunzh Jun 18, 2025
c843822
Merge Mike's commit suggestion
xukunzh Jun 18, 2025
b03c7bd
Merge Mike's commit suggestion
xukunzh Jun 18, 2025
53d75ff
Integrate FridaExtractor into Capa
xukunzh Jun 18, 2025
4c681df
Use Pydantic models to validate these JSON blobs
xukunzh Jun 20, 2025
7886446
Add arguments handling
xukunzh Jun 23, 2025
28d28f9
Change to FORMAT_APK in common.py
xukunzh Jun 23, 2025
24fe942
Change to FORMAT_APK in extractor.py
xukunzh Jun 23, 2025
3c1bae7
Fix a AttributeError bug
xukunzh Jun 23, 2025
20839a0
Merge pull request #3 from xukunzh/FridaExtractor
xukunzh Jun 23, 2025
a1b8b11
Squash fix commits into one
xukunzh Jun 24, 2025
191bf03
Merge branch 'master' into FridaExtractor
xukunzh Jun 24, 2025
c28d1e2
Update the value type in Argument model
xukunzh Jun 27, 2025
25bd5c0
Merge pull request #4 from xukunzh/FridaExtractor
xukunzh Jun 27, 2025
25696a9
Auto-generate Frida hooks from Capa rules
xukunzh Jun 27, 2025
98391f3
Switch to use APIs JSON file
xukunzh Jul 2, 2025
1da2435
Merge pull request #5 from xukunzh/FridaExtractor
xukunzh Jul 11, 2025
63304d2
add Java native & static method support and update model with Pydantic
xukunzh Jul 11, 2025
4a015df
Update scripts/frida/hook_builder.py
xukunzh Jul 19, 2025
14a60a2
Merge pull request #6 from xukunzh/FridaExtractor
xukunzh Jul 19, 2025
0752417
Add native API hooking support
xukunzh Jul 19, 2025
601037c
Merge branch 'master' into FridaExtractor
xukunzh Jul 19, 2025
5aeb03f
Add missing changes from last PR
xukunzh Jul 19, 2025
346b0e3
Apply pre-commit formatting to existing code
xukunzh Jul 24, 2025
2cba84f
Merge pull request #7 from xukunzh/FridaExtractor
xukunzh Jul 24, 2025
fd859d4
Add complete script generation and reorganize templates folder
xukunzh Jul 24, 2025
79e72b2
Add changes from last PR's comments
xukunzh Jul 24, 2025
9e32651
Update README setup and workflow
xukunzh Jul 25, 2025
9f69908
Remove files from git tracking
xukunzh Jul 25, 2025
d22af94
Add back SELinux disable step
xukunzh Aug 6, 2025
41035b2
Fix typo
xukunzh Aug 6, 2025
8a3dab7
Merge pull request #8 from xukunzh/FridaExtractor
xukunzh Aug 6, 2025
69e2179
Add APK hashes support to Frida extractor
xukunzh Aug 9, 2025
14a2ab1
Require all fields in models
xukunzh Aug 9, 2025
f509479
Changed to get package_name from input
xukunzh Aug 12, 2025
94cf914
Automate Frida analysis workflow with frida-compile
xukunzh Aug 16, 2025
e88b4f7
Change to generate .ts script for now in manual workflow
xukunzh Aug 16, 2025
a5e144f
Update error handling
xukunzh Aug 17, 2025
69286a9
Fix a format issue with pre-commit and update gitignore
xukunzh Aug 17, 2025
b70d798
Delete previous JS main templete
xukunzh Aug 17, 2025
e4ff271
Add APK installation process
xukunzh Aug 20, 2025
0211407
Add auto emulator creation
xukunzh Aug 22, 2025
4e0f64c
Fix create_emulator
xukunzh Aug 22, 2025
d6438ab
Fix directory creation issue
xukunzh Aug 22, 2025
47a4dae
Fix root access detection issue
xukunzh Aug 22, 2025
edbf385
Merge pull request #9 from xukunzh/FridaExtractor
xukunzh Aug 26, 2025
3b4ac24
Change to exception raising and Replace print with logging
xukunzh Aug 26, 2025
0edbdc2
Update frida_api.json with lastest API list
xukunzh Aug 26, 2025
5db7cf6
Simplify and fix emulator setup details in setup.md
xukunzh Aug 26, 2025
09915d3
Make package name optional and add aapt APK extraction
xukunzh Aug 27, 2025
6bd19e5
Reorganize README and setup.md
xukunzh Aug 28, 2025
a38c397
Keep only automated setup in README
xukunzh Aug 28, 2025
cc40e39
Update capa and dependencies installation instructions
xukunzh Aug 28, 2025
4de0aa7
Update CHANGELOG
xukunzh Aug 29, 2025
ac50435
Merge branch 'frida-gsoc' into master
xukunzh Aug 29, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,9 @@
## master (unreleased)

### New Features
- add Frida dynamic analysis support for Android applications #2712 @xukunzh
- add FridaExtractor for processing Android runtime behavioral data
- add automated Android analysis workflow with emulator creation and script generation
- ci: add support for arm64 binary releases

### Breaking Changes
Expand Down
8 changes: 6 additions & 2 deletions capa/features/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -439,9 +439,10 @@ def get_value_str(self):
ARCH_I386 = "i386"
ARCH_AMD64 = "amd64"
ARCH_AARCH64 = "aarch64"
ARCH_ARM = "arm"
# dotnet
ARCH_ANY = "any"
VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_AARCH64, ARCH_ANY)
VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_AARCH64, ARCH_ARM, ARCH_ANY)


class Arch(Feature):
Expand Down Expand Up @@ -485,7 +486,8 @@ def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
FORMAT_PE = "pe"
FORMAT_ELF = "elf"
FORMAT_DOTNET = "dotnet"
VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET)
FORMAT_APK = "apk"
VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET, FORMAT_APK)
# internal only, not to be used in rules
FORMAT_AUTO = "auto"
FORMAT_SC32 = "sc32"
Expand All @@ -497,6 +499,7 @@ def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
FORMAT_FREEZE = "freeze"
FORMAT_RESULT = "result"
FORMAT_BINJA_DB = "binja_database"
FORMAT_FRIDA = "frida"
STATIC_FORMATS = {
FORMAT_SC32,
FORMAT_SC64,
Expand All @@ -514,6 +517,7 @@ def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
FORMAT_VMRAY,
FORMAT_FREEZE,
FORMAT_RESULT,
FORMAT_FRIDA,
}
FORMAT_UNKNOWN = "unknown"

Expand Down
Empty file.
141 changes: 141 additions & 0 deletions capa/features/extractors/frida/extractor.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
from typing import Union, Iterator
from pathlib import Path

from capa.features.insn import API, Number
from capa.features.common import (
OS,
ARCH_ARM,
ARCH_I386,
ARCH_AMD64,
FORMAT_APK,
OS_ANDROID,
ARCH_AARCH64,
Arch,
Format,
String,
Feature,
)
from capa.features.address import NO_ADDRESS, Address, ThreadAddress, ProcessAddress, DynamicCallAddress, _NoAddress
from capa.features.extractors.frida.models import Call, FridaReport
from capa.features.extractors.base_extractor import (
CallHandle,
SampleHashes,
ThreadHandle,
ProcessHandle,
DynamicFeatureExtractor,
)


class FridaExtractor(DynamicFeatureExtractor):
    """
    Dynamic feature extractor for Android applications instrumented with Frida.

    Wraps a parsed FridaReport and serves its processes, threads, and API calls
    through the DynamicFeatureExtractor interface.
    """

    def __init__(self, report: FridaReport):
        """Keep the report and register its hashes, lower-cased, as the sample hashes."""
        sample_hashes = SampleHashes(
            md5=report.hashes.md5.lower(),
            sha1=report.hashes.sha1.lower(),
            sha256=report.hashes.sha256.lower(),
        )
        super().__init__(hashes=sample_hashes)
        self.report: FridaReport = report

    def get_base_address(self) -> Union[_NoAddress, None]:
        """Always NO_ADDRESS for this extractor."""
        return NO_ADDRESS

    def extract_global_features(self) -> Iterator[tuple[Feature, Address]]:
        """Yield the OS, then the architecture (if the first process reports one), then the format."""
        yield OS(OS_ANDROID), NO_ADDRESS

        processes = self.report.processes
        frida_arch = processes[0].arch if processes else None
        if frida_arch:
            # translate the report's architecture name to capa's constant;
            # names missing from the table are passed through unchanged
            capa_arch = {
                "arm64": ARCH_AARCH64,
                "arm": ARCH_ARM,
                "x64": ARCH_AMD64,
                "ia32": ARCH_I386,
            }.get(frida_arch, frida_arch)
            yield Arch(capa_arch), NO_ADDRESS

        yield Format(FORMAT_APK), NO_ADDRESS

    def extract_file_features(self) -> Iterator[tuple[Feature, Address]]:
        """Yield the package name as the only file-level feature."""
        yield String(self.report.package_name), NO_ADDRESS

    def get_processes(self) -> Iterator[ProcessHandle]:
        """Yield one handle per process in the report; the parent pid is always recorded as 0."""
        for proc in self.report.processes:
            yield ProcessHandle(address=ProcessAddress(pid=proc.pid, ppid=0), inner=proc)

    def extract_process_features(self, ph: ProcessHandle) -> Iterator[tuple[Feature, Address]]:
        """Yield nothing."""
        # TODO: we have not identified process-specific features for Frida yet
        yield from ()

    def get_process_name(self, ph: ProcessHandle) -> str:
        """Return the package name of the wrapped process."""
        return ph.inner.package_name

    def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
        """Yield one handle per distinct thread id found among the process's calls."""
        seen_tids = {call.thread_id for call in ph.inner.calls}
        for tid in seen_tids:
            yield ThreadHandle(address=ThreadAddress(process=ph.address, tid=tid), inner={"tid": tid})

    def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[tuple[Feature, Address]]:
        """Yield nothing."""
        # TODO: we have not identified thread-specific features for Frida yet
        yield from ()

    def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]:
        """Yield a handle for every call of the process that belongs to the given thread."""
        for call in ph.inner.calls:
            if call.thread_id != th.address.tid:
                continue
            yield CallHandle(address=DynamicCallAddress(thread=th.address, id=call.call_id), inner=call)

    def extract_call_features(
        self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle
    ) -> Iterator[tuple[Feature, Address]]:
        """
        Yield the API feature of a call, then one feature per argument value.

        String values become String features; int, float, and bool values become
        Number features; any other value (such as None) yields nothing.
        """
        call: Call = ch.inner

        yield API(call.api_name), ch.address

        for argument in call.arguments or ():
            value = argument.value
            if isinstance(value, str):
                yield String(value), ch.address
            elif isinstance(value, (int, float, bool)):
                yield Number(value), ch.address

    def get_call_name(self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> str:
        """Render a call as `api_name(name=value, ...)`, with empty parentheses when it has no arguments."""
        call: Call = ch.inner

        rendered_args = ""
        if call.arguments:
            # arguments are named in the report, so show them as name=value pairs
            rendered_args = ", ".join(f"{argument.name}={argument.value!s}" for argument in call.arguments)

        return f"{call.api_name}({rendered_args})"

    @classmethod
    def from_jsonl_file(cls, jsonl_path: Path) -> "FridaExtractor":
        """Entry point: parse the JSONL file into a FridaReport and wrap it in an extractor."""
        return cls(FridaReport.from_jsonl_file(jsonl_path))
96 changes: 96 additions & 0 deletions capa/features/extractors/frida/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
import json
from typing import List, Union

from pydantic import Field, BaseModel, ConfigDict


class FlexibleModel(BaseModel):
    """Base model that accepts input keys beyond the fields it declares."""

    # extra="allow": undeclared keys are kept on the instance instead of being rejected
    model_config = ConfigDict(extra="allow")


class Hashes(BaseModel):
    """MD5, SHA-1, and SHA-256 digests carried in the report metadata, as strings."""

    md5: str
    sha1: str
    sha256: str


class Metadata(FlexibleModel):
    """Contents of the "metadata" record of a Frida JSONL log; every field is required."""

    process_id: int
    package_name: str
    arch: str
    platform: str
    hashes: Hashes


class Argument(FlexibleModel):
    """A single named argument of a captured API call."""

    name: str
    # only scalar values (or None) are accepted
    value: Union[str, int, float, bool, None]


class Call(FlexibleModel):
    """A single API call captured by Frida."""

    # API name like "java.io.File.<init>"; not sure yet whether 'japi', 'napi', 'jni', ... need to be separated
    api_name: str
    process_id: int
    thread_id: int
    call_id: int
    # timestamp: Optional[str] = None
    # an absent "arguments" key yields an empty list
    arguments: List[Argument] = Field(default_factory=list)
    # return_value: Optional[Any] = None  # Not very sure if we should use str as the return value type
    # caller: Optional[str] = None


class Process(FlexibleModel):
    """Process information from a Frida analysis, together with the calls captured in it."""

    # ppid is omitted here as Android apps are usually single-process; it will be set to 0 in extractor.py
    pid: int
    package_name: str
    arch: str
    platform: str
    # an absent "calls" key yields an empty list
    calls: List[Call] = Field(default_factory=list)


class FridaReport(FlexibleModel):
    """Main report structure for Android analysis."""

    # TODO: Some more file-level information may go here
    package_name: str
    processes: List[Process] = Field(default_factory=list)
    hashes: Hashes

    @classmethod
    def from_jsonl_file(cls, jsonl_path) -> "FridaReport":
        """
        Load a report from a JSON Lines file.

        Every non-blank line must be one JSON value. A record with a "metadata"
        key supplies the process, package, and hash information; if several are
        present, the last one wins. A record with an "api" key that contains
        "java_api" or "native_api" supplies one captured call. Any other record
        is ignored.

        Args:
            jsonl_path: path of the JSONL file to read.

        Returns:
            a report holding a single process that owns every captured call.

        Raises:
            json.JSONDecodeError: a non-blank line is not valid JSON.
            ValueError: the file contains no metadata record.
        """
        metadata = None
        api_calls = []

        # Decode as UTF-8 explicitly so parsing does not depend on the platform locale.
        # Iterating the file splits on real newlines only; str.splitlines() would also
        # split on characters such as U+2028 that may appear raw inside JSON strings.
        with open(jsonl_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if not line:
                    # blank lines carry no record; json.loads would reject them
                    continue

                record = json.loads(line)

                if "metadata" in record:
                    metadata = Metadata(**record["metadata"])
                elif "api" in record:
                    api = record["api"]
                    if "java_api" in api:
                        api_calls.append(Call(**api["java_api"]))
                    elif "native_api" in api:
                        api_calls.append(Call(**api["native_api"]))

        if metadata is None:
            raise ValueError("No metadata found in JSONL file")

        process = Process(
            pid=metadata.process_id,
            package_name=metadata.package_name,
            arch=metadata.arch,
            platform=metadata.platform,
            calls=api_calls,
        )

        return cls(package_name=metadata.package_name, processes=[process], hashes=metadata.hashes)
15 changes: 14 additions & 1 deletion capa/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
FORMAT_CAPE,
FORMAT_SC32,
FORMAT_SC64,
FORMAT_FRIDA,
FORMAT_VMRAY,
FORMAT_DOTNET,
FORMAT_FREEZE,
Expand All @@ -63,7 +64,7 @@
# CAPE (.json, .json_, .json.gz)
# DRAKVUF (.log, .log.gz)
# VMRay (.zip)
EXTENSIONS_DYNAMIC = ("json", "json_", "json.gz", "log", ".log.gz", ".zip")
EXTENSIONS_DYNAMIC = ("json", "json_", "json.gz", "log", ".log.gz", ".zip", "jsonl")
EXTENSIONS_BINEXPORT2 = ("BinExport", "BinExport2")
EXTENSIONS_ELF = "elf_"
EXTENSIONS_FREEZE = "frz"
Expand Down Expand Up @@ -225,6 +226,9 @@ def get_format_from_report(sample: Path) -> str:
# CAPE report that's missing the "CAPE" key,
# which is not going to be much use, but its correct.
return FORMAT_CAPE
elif sample.name.endswith(".jsonl"):
# TODO: find a way to identify Frida reports by their content instead of relying only on the .jsonl extension.
return FORMAT_FRIDA

return FORMAT_UNKNOWN

Expand Down Expand Up @@ -325,6 +329,15 @@ def log_empty_sandbox_report_error(error: str, sandbox_name: str):
logger.error("-" * 80)


def log_unsupported_frida_report_error(error: str):
    """Log an error banner saying the input is not a valid Frida report, including the given reason."""
    separator = "-" * 80

    logger.error(separator)
    logger.error(" Input file is not a valid Frida report: %s", error)
    for message in (
        " ",
        " capa currently supports analyzing Frida dynamic analysis reports in JSONL format.",
        " Please make sure your report file was generated by the Frida java_monitor.js script.",
    ):
        logger.error(message)
    logger.error(separator)


def log_unsupported_os_error():
logger.error("-" * 80)
logger.error(" Input file does not appear to target a supported OS.")
Expand Down
13 changes: 13 additions & 0 deletions capa/loader.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
import capa.features.extractors.common
import capa.features.extractors.base_extractor
import capa.features.extractors.cape.extractor
import capa.features.extractors.frida.extractor
from capa.rules import RuleSet
from capa.engine import MatchResults
from capa.exceptions import UnsupportedOSError, UnsupportedArchError, UnsupportedFormatError
Expand All @@ -52,6 +53,7 @@
FORMAT_CAPE,
FORMAT_SC32,
FORMAT_SC64,
FORMAT_FRIDA,
FORMAT_VMRAY,
FORMAT_DOTNET,
FORMAT_DRAKVUF,
Expand Down Expand Up @@ -79,6 +81,7 @@
BACKEND_FREEZE = "freeze"
BACKEND_BINEXPORT2 = "binexport2"
BACKEND_IDA = "ida"
BACKEND_FRIDA = "frida"


class CorruptFile(ValueError):
Expand Down Expand Up @@ -351,6 +354,11 @@ def get_extractor(

return capa.features.extractors.ida.extractor.IdaFeatureExtractor()

elif backend == BACKEND_FRIDA:
import capa.features.extractors.frida.extractor

return capa.features.extractors.frida.extractor.FridaExtractor.from_jsonl_file(input_path)

else:
raise ValueError("unexpected backend: " + backend)

Expand Down Expand Up @@ -422,6 +430,11 @@ def get_file_extractors(input_file: Path, input_format: str) -> list[FeatureExtr
elif input_format == FORMAT_BINEXPORT2:
file_extractors = _get_binexport2_file_extractors(input_file)

elif input_format == FORMAT_FRIDA:
import capa.features.extractors.frida.extractor

file_extractors.append(capa.features.extractors.frida.extractor.FridaExtractor.from_jsonl_file(input_file))

return file_extractors


Expand Down
Loading