mandiant · xukunzh · May 29, 2025 · Jun 4, 2025 · Jun 13, 2025 · Jun 13, 2025
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,6 +3,9 @@
 ## master (unreleased)
 
 ### New Features
+- add Frida dynamic analysis support for Android applications #2712 @xukunzh
+- add FridaExtractor for processing Android runtime behavioral data
+- add automated Android analysis workflow with emulator creation and script generation
 - ci: add support for arm64 binary releases
 
 ### Breaking Changes

diff --git a/capa/features/common.py b/capa/features/common.py
@@ -439,9 +439,10 @@ def get_value_str(self):
 ARCH_I386 = "i386"
 ARCH_AMD64 = "amd64"
 ARCH_AARCH64 = "aarch64"
+ARCH_ARM = "arm"
 # dotnet
 ARCH_ANY = "any"
-VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_AARCH64, ARCH_ANY)
+VALID_ARCH = (ARCH_I386, ARCH_AMD64, ARCH_AARCH64, ARCH_ARM, ARCH_ANY)
 
 
 class Arch(Feature):
@@ -485,7 +486,8 @@ def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
 FORMAT_PE = "pe"
 FORMAT_ELF = "elf"
 FORMAT_DOTNET = "dotnet"
-VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET)
+FORMAT_APK = "apk"
+VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET, FORMAT_APK)
 # internal only, not to be used in rules
 FORMAT_AUTO = "auto"
 FORMAT_SC32 = "sc32"
@@ -497,6 +499,7 @@ def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
 FORMAT_FREEZE = "freeze"
 FORMAT_RESULT = "result"
 FORMAT_BINJA_DB = "binja_database"
+FORMAT_FRIDA = "frida"
 STATIC_FORMATS = {
     FORMAT_SC32,
     FORMAT_SC64,
@@ -514,6 +517,7 @@ def evaluate(self, features: "capa.engine.FeatureSet", short_circuit=True):
     FORMAT_VMRAY,
     FORMAT_FREEZE,
     FORMAT_RESULT,
+    FORMAT_FRIDA,
 }
 FORMAT_UNKNOWN = "unknown"
 

diff --git a/capa/features/extractors/frida/__init__.py b/capa/features/extractors/frida/__init__.py
diff --git a/capa/features/extractors/frida/extractor.py b/capa/features/extractors/frida/extractor.py
@@ -0,0 +1,141 @@
+from typing import Union, Iterator
+from pathlib import Path
+
+from capa.features.insn import API, Number
+from capa.features.common import (
+    OS,
+    ARCH_ARM,
+    ARCH_I386,
+    ARCH_AMD64,
+    FORMAT_APK,
+    OS_ANDROID,
+    ARCH_AARCH64,
+    Arch,
+    Format,
+    String,
+    Feature,
+)
+from capa.features.address import NO_ADDRESS, Address, ThreadAddress, ProcessAddress, DynamicCallAddress, _NoAddress
+from capa.features.extractors.frida.models import Call, FridaReport
+from capa.features.extractors.base_extractor import (
+    CallHandle,
+    SampleHashes,
+    ThreadHandle,
+    ProcessHandle,
+    DynamicFeatureExtractor,
+)
+
+
+class FridaExtractor(DynamicFeatureExtractor):
+    """
+    Frida dynamic analysis feature extractor for Android applications.
+
+    Processes JSON output from Frida instrumentation to extract behavioral features.
+
+    The extractor wraps a parsed FridaReport and exposes it through the
+    process -> thread -> call hierarchy of DynamicFeatureExtractor.
+    The report stores calls per process only; threads are derived from the
+    `thread_id` recorded on each call.
+    """
+
+    def __init__(self, report: FridaReport):
+        """Keep the report and pass its lower-cased sample hashes to the base class."""
+        super().__init__(
+            hashes=SampleHashes(
+                md5=report.hashes.md5.lower(),
+                sha1=report.hashes.sha1.lower(),
+                sha256=report.hashes.sha256.lower(),
+            )
+        )
+        self.report: FridaReport = report
+
+    def get_base_address(self) -> Union[_NoAddress, None]:
+        """Always return NO_ADDRESS."""
+        return NO_ADDRESS
+
+    def extract_global_features(self) -> Iterator[tuple[Feature, Address]]:
+        """
+        Yield the OS, architecture, and format features of the sample.
+
+        The architecture is read from the first process of the report only.
+        Arch names listed in the mapping below are translated to capa's constants;
+        any other non-empty value is passed through unchanged.
+        """
+        yield OS(OS_ANDROID), NO_ADDRESS
+
+        if self.report.processes:
+            process = self.report.processes[0]
+
+            if process.arch:
+                arch_mapping = {"arm64": ARCH_AARCH64, "arm": ARCH_ARM, "x64": ARCH_AMD64, "ia32": ARCH_I386}
+                capa_arch = arch_mapping.get(process.arch, process.arch)
+                yield Arch(capa_arch), NO_ADDRESS
+
+        yield Format(FORMAT_APK), NO_ADDRESS
+
+    def extract_file_features(self) -> Iterator[tuple[Feature, Address]]:
+        """Yield the package name of the report as a String feature."""
+        yield String(self.report.package_name), NO_ADDRESS
+
+    def get_processes(self) -> Iterator[ProcessHandle]:
+        """Yield one ProcessHandle per process in the report."""
+        for process in self.report.processes:
+            # the report carries no parent pid, so ppid is fixed to 0
+            addr = ProcessAddress(pid=process.pid, ppid=0)
+            yield ProcessHandle(address=addr, inner=process)
+
+    def extract_process_features(self, ph: ProcessHandle) -> Iterator[tuple[Feature, Address]]:
+        """Yield nothing; see the TODO below."""
+        # TODO: we have not identified process-specific features for Frida yet
+        yield from []
+
+    def get_process_name(self, ph: ProcessHandle) -> str:
+        """Return the package name stored on the process."""
+        return ph.inner.package_name
+
+    def get_threads(self, ph: ProcessHandle) -> Iterator[ThreadHandle]:
+        """
+        Yield one ThreadHandle per distinct `thread_id` seen in the process's calls.
+
+        Thread ids are yielded in ascending order so that the thread sequence
+        does not depend on set iteration order.
+        """
+        thread_ids = {call.thread_id for call in ph.inner.calls}
+
+        for tid in sorted(thread_ids):
+            addr = ThreadAddress(process=ph.address, tid=tid)
+            yield ThreadHandle(address=addr, inner={"tid": tid})
+
+    def extract_thread_features(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[tuple[Feature, Address]]:
+        """Yield nothing; see the TODO below."""
+        # TODO: we have not identified thread-specific features for Frida yet
+        yield from []
+
+    def get_calls(self, ph: ProcessHandle, th: ThreadHandle) -> Iterator[CallHandle]:
+        """Yield, in report order, the calls of `ph` whose `thread_id` matches `th`."""
+        for call in ph.inner.calls:
+            if call.thread_id == th.address.tid:
+                addr = DynamicCallAddress(thread=th.address, id=call.call_id)
+                yield CallHandle(address=addr, inner=call)
+
+    def extract_call_features(
+        self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle
+    ) -> Iterator[tuple[Feature, Address]]:
+        """
+        Yield the API feature of a call, followed by one feature per typed argument.
+
+        Numeric and boolean argument values become Number features, string values
+        become String features; arguments with any other value (e.g. None) are skipped.
+        """
+        call: Call = ch.inner
+
+        yield API(call.api_name), ch.address
+
+        if call.arguments:
+            for arg_obj in call.arguments:
+                arg_value = arg_obj.value
+                # bool is a subclass of int, so booleans are emitted as Number features as well
+                if isinstance(arg_value, (int, float, bool)):
+                    yield Number(arg_value), ch.address
+                elif isinstance(arg_value, str):
+                    yield String(arg_value), ch.address
+
+    def get_call_name(self, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle) -> str:
+        """
+        Render a call as `api_name(name=value, ...)`.
+
+        A call without arguments renders as `api_name()`.
+        """
+        call: Call = ch.inner
+
+        # Current implementation: Display name=value, since we have arg name
+        rendered_args = ", ".join(f"{arg_obj.name}={arg_obj.value}" for arg_obj in (call.arguments or ()))
+
+        return f"{call.api_name}({rendered_args})"
+
+    @classmethod
+    def from_jsonl_file(cls, jsonl_path: Path) -> "FridaExtractor":
+        """Entry point: Create an extractor from a JSONL file"""
+        report = FridaReport.from_jsonl_file(jsonl_path)
+        return cls(report)
diff --git a/capa/features/extractors/frida/models.py b/capa/features/extractors/frida/models.py
@@ -0,0 +1,96 @@
+import json
+from typing import List, Union
+
+from pydantic import Field, BaseModel, ConfigDict
+
+
+class FlexibleModel(BaseModel):
+    """Base class for the Frida report models, configured with `extra="allow"`."""
+
+    # per pydantic's ConfigDict documentation, extra="allow" keeps input fields that are
+    # not declared on the model instead of rejecting them
+    model_config = ConfigDict(extra="allow")
+
+
+class Hashes(BaseModel):
+    """Hashes of the analyzed sample, as carried in the report metadata."""
+
+    # all three are plain strings; their format is not validated by this model
+    md5: str
+    sha1: str
+    sha256: str
+
+
+class Metadata(FlexibleModel):
+    """Contents of the "metadata" record of a Frida JSONL report."""
+
+    # id of the instrumented process
+    process_id: int
+    # package name of the analyzed application
+    package_name: str
+    # architecture string as reported in the trace (e.g. "arm64"; exact value set not enforced here)
+    arch: str
+    # platform string as reported in the trace; not validated here
+    platform: str
+    # hashes of the analyzed sample
+    hashes: Hashes
+
+
+class Argument(FlexibleModel):
+    """Represents a single argument in an API call"""
+
+    # argument name
+    name: str
+    # argument value, restricted to JSON scalar types; None when the value is null
+    value: Union[str, int, float, bool, None]
+
+
+class Call(FlexibleModel):
+    """Represents a single API call captured by Frida"""
+
+    # API name like "java.io.File.<init>"; it is undecided whether 'japi', 'napi' and 'jni' calls
+    # need to be separated
+    api_name: str
+    # id of the process that made the call
+    process_id: int
+    # id of the thread that made the call
+    thread_id: int
+    # identifier of the call within the trace
+    call_id: int
+    # fields below are not modeled yet
+    # timestamp: Optional[str] = None
+    # arguments of the call; defaults to an empty list when the record has none
+    arguments: List[Argument] = Field(default_factory=list)
+    # return_value: Optional[Any] = None     # Not very sure if we should use str as the return value type
+    # caller: Optional[str] = None
+
+
+class Process(FlexibleModel):
+    """Process information from Frida analysis"""
+
+    # ppid is omitted here as Android apps are usually single-process; it will be set to 0 in extractor.py
+    # id of the process
+    pid: int
+    # package name of the application running in this process
+    package_name: str
+    # architecture string as reported in the trace
+    arch: str
+    # platform string as reported in the trace
+    platform: str
+    # every call captured for this process, across all of its threads
+    calls: List[Call] = Field(default_factory=list)
+
+
+class FridaReport(FlexibleModel):
+    """Main report structure for Android analysis"""
+
+    # TODO: Some more file-level information may go here
+    # package name of the analyzed application
+    package_name: str
+    # processes captured in the report; from_jsonl_file() always builds exactly one
+    processes: List[Process] = Field(default_factory=list)
+    # hashes of the analyzed sample
+    hashes: Hashes
+
+    @classmethod
+    def from_jsonl_file(cls, jsonl_path) -> "FridaReport":
+        """
+        Load a report from a JSON Lines file.
+
+        Every non-blank line must hold one JSON object. A record with a "metadata"
+        key describes the traced process. A record with an "api" key carries one
+        captured call under either "java_api" or "native_api". Records of any other
+        shape are ignored. If several metadata records are present, the last one wins.
+
+        args:
+          jsonl_path: path of the JSONL file to read.
+
+        returns:
+          a FridaReport holding a single Process that owns every captured call.
+
+        raises:
+          ValueError: when the file has no metadata record. Malformed lines raise
+            json.JSONDecodeError, which is a ValueError subclass.
+          Model validation errors raised while building Metadata, Call or Process
+          propagate to the caller.
+        """
+        metadata = None
+        api_calls = []
+
+        # JSON text is UTF-8; do not depend on the platform's default encoding
+        with open(jsonl_path, "r", encoding="utf-8") as f:
+            # Iterate the file object rather than calling str.splitlines(): splitlines() also breaks
+            # on characters such as U+2028 and U+2029, which may appear unescaped inside a JSON string
+            # and would cut a record in half.
+            for line in f:
+                if not line.strip():
+                    # tolerate blank lines between records or at the end of the file
+                    continue
+
+                record = json.loads(line)
+
+                if "metadata" in record:
+                    metadata = Metadata(**record["metadata"])
+                elif "api" in record:
+                    api = record["api"]
+                    # "java_api" takes precedence when both keys are present
+                    for kind in ("java_api", "native_api"):
+                        if kind in api:
+                            api_calls.append(Call(**api[kind]))
+                            break
+
+        if metadata is None:
+            raise ValueError("No metadata found in JSONL file")
+
+        process = Process(
+            pid=metadata.process_id,
+            package_name=metadata.package_name,
+            arch=metadata.arch,
+            platform=metadata.platform,
+            calls=api_calls,
+        )
+
+        return cls(package_name=metadata.package_name, processes=[process], hashes=metadata.hashes)
diff --git a/capa/helpers.py b/capa/helpers.py
@@ -48,6 +48,7 @@
     FORMAT_CAPE,
     FORMAT_SC32,
     FORMAT_SC64,
+    FORMAT_FRIDA,
     FORMAT_VMRAY,
     FORMAT_DOTNET,
     FORMAT_FREEZE,
@@ -63,7 +64,7 @@
 # CAPE (.json, .json_, .json.gz)
 # DRAKVUF (.log, .log.gz)
 # VMRay (.zip)
-EXTENSIONS_DYNAMIC = ("json", "json_", "json.gz", "log", ".log.gz", ".zip")
+EXTENSIONS_DYNAMIC = ("json", "json_", "json.gz", "log", ".log.gz", ".zip", "jsonl")
 EXTENSIONS_BINEXPORT2 = ("BinExport", "BinExport2")
 EXTENSIONS_ELF = "elf_"
 EXTENSIONS_FREEZE = "frz"
@@ -225,6 +226,9 @@ def get_format_from_report(sample: Path) -> str:
             # CAPE report that's missing the "CAPE" key,
             # which is not going to be much use, but its correct.
             return FORMAT_CAPE
+    elif sample.name.endswith(".jsonl"):
+        # TODO: Find out a way to classify frida later.
+        return FORMAT_FRIDA
 
     return FORMAT_UNKNOWN
 
@@ -325,6 +329,15 @@ def log_empty_sandbox_report_error(error: str, sandbox_name: str):
     logger.error("-" * 80)
 
 
+def log_unsupported_frida_report_error(error: str):
+    """Log an error banner stating that the input is not a valid Frida report, including `error`."""
+    separator = "-" * 80
+    guidance = (
+        " ",
+        " capa currently supports analyzing Frida dynamic analysis reports in JSONL format.",
+        " Please make sure your report file was generated by the Frida java_monitor.js script.",
+    )
+
+    logger.error(separator)
+    logger.error(" Input file is not a valid Frida report: %s", error)
+    for text in guidance:
+        logger.error(text)
+    logger.error(separator)
+
+
 def log_unsupported_os_error():
     logger.error("-" * 80)
     logger.error(" Input file does not appear to target a supported OS.")

diff --git a/capa/loader.py b/capa/loader.py
@@ -41,6 +41,7 @@
 import capa.features.extractors.common
 import capa.features.extractors.base_extractor
 import capa.features.extractors.cape.extractor
+import capa.features.extractors.frida.extractor
 from capa.rules import RuleSet
 from capa.engine import MatchResults
 from capa.exceptions import UnsupportedOSError, UnsupportedArchError, UnsupportedFormatError
@@ -52,6 +53,7 @@
     FORMAT_CAPE,
     FORMAT_SC32,
     FORMAT_SC64,
+    FORMAT_FRIDA,
     FORMAT_VMRAY,
     FORMAT_DOTNET,
     FORMAT_DRAKVUF,
@@ -79,6 +81,7 @@
 BACKEND_FREEZE = "freeze"
 BACKEND_BINEXPORT2 = "binexport2"
 BACKEND_IDA = "ida"
+BACKEND_FRIDA = "frida"
 
 
 class CorruptFile(ValueError):
@@ -351,6 +354,11 @@ def get_extractor(
 
         return capa.features.extractors.ida.extractor.IdaFeatureExtractor()
 
+    elif backend == BACKEND_FRIDA:
+        import capa.features.extractors.frida.extractor
+
+        return capa.features.extractors.frida.extractor.FridaExtractor.from_jsonl_file(input_path)
+
     else:
         raise ValueError("unexpected backend: " + backend)
 
@@ -422,6 +430,11 @@ def get_file_extractors(input_file: Path, input_format: str) -> list[FeatureExtr
     elif input_format == FORMAT_BINEXPORT2:
         file_extractors = _get_binexport2_file_extractors(input_file)
 
+    elif input_format == FORMAT_FRIDA:
+        import capa.features.extractors.frida.extractor
+
+        file_extractors.append(capa.features.extractors.frida.extractor.FridaExtractor.from_jsonl_file(input_file))
+
     return file_extractors