release: v1.0.2

- 124 sources (+1 xposedornot, bgpview replaced with ripestat)
- Fix gravatar MD5 transform, fofa base64 query encoding
- Fix misp_search URL resolution, threatconnect HMAC placeholder
- Fix spycloud, duckduckgo, mailboxlayer/numverify/ipstack/ipinfodb endpoints
- Fix DeHashEngine v1→v2, DorkEngine engine label, backup_endpoints consumed
- Fix Retry-After HTTP-date parsing, Hashmob API schema, FIPS hashlib crash
- Fix DB.close() event loop leak, _random_headers CH-UA override
- Add query_transform mechanism (md5_lower, fofa_domain)
- Lower scores: spyonweb, pipl_search, twitter_v2, hudsonrock rate_limit
- Clean all internal tracking comments, fix Italian docstring
2026-06-08 16:07:17 +00:00 · 2026-04-14 21:18:30 +02:00
parent cf4428329e
commit 9bf66d3e50
26 changed files with 345 additions and 205 deletions
@@ -31,7 +31,6 @@ _PIVOT_TYPES = {"email", "username", "phone", "name", "ip", "domain"}


 def _cfg_depth(orc=None) -> int:
-    # A7/A10: read from orchestrator config if available
    if orc is not None:
        cfg = getattr(orc, "config", None)
        if cfg is not None:
@@ -46,7 +45,6 @@ def _cfg_depth(orc=None) -> int:


 def _cfg_concurrency(orc=None) -> int:
-    # A7: read from orchestrator config if available
    if orc is not None:
        cfg = getattr(orc, "config", None)
        if cfg is not None:
@@ -137,29 +135,24 @@ class AvalancheScanner:
    def __init__(self, orchestrator: "Orchestrator") -> None:
        self._orc             = orchestrator
        self.seen_assets: Set[str]  = set()
-        # A2: single semaphore for the entire run, created lazily inside the event loop
        self._sem: Optional[asyncio.Semaphore] = None
        self._all_records: List     = []
        self._dork_hits:   List[dict] = []
        self._seen_dork_urls: Set[str] = set()
-        # A6: scrape_hits merged atomically per _do_process call
        self._scrape_hits: Dict     = {"pastes": [], "credentials": [], "hashes": [],
                                       "telegram": [], "dork_misconfigs": []}
        self._max_depth: int        = 0
        self._in_flight: Dict[str, asyncio.Future] = {}
        self.pivot_log: List[dict]  = []
-        # A8: global set to prevent duplicate entries in discovered_assets
        self._seen_discovered: Set[str] = set()
        self.discovered_assets: List[dict] = []

    def _get_sem(self) -> asyncio.Semaphore:
-        # A2: semaphore created once per run, shared across all coroutines
        if self._sem is None:
            self._sem = asyncio.Semaphore(_cfg_concurrency(self._orc))
        return self._sem

    async def run(self, target: str) -> tuple:
-        # A9: respect no_pivot flag from config
        cfg = getattr(self._orc, "config", None)
        no_pivot = getattr(cfg, "no_pivot", False) if cfg else False
        if no_pivot:
@@ -196,7 +189,6 @@ class AvalancheScanner:
    async def _process(self, asset: str, depth: int,
                       parent: Optional[str], found_in: str) -> None:
        """Dedup gate: ensures each asset is processed exactly once."""
-        # A10: use per-run depth from orchestrator config
        if depth > _cfg_depth(self._orc):
            _syslog.debug("avalanche depth cap reached for %s", asset)
            return
@@ -205,7 +197,7 @@ class AvalancheScanner:
        if not key:
            return

-        # A1: add to seen_assets FIRST (atomic gate) before any other check.
+        # Add to seen_assets before any await to prevent concurrent duplicates.
        # If already present, wait on the in-flight future if one exists, then return.
        if key in self.seen_assets:
            if key in self._in_flight:
@@ -326,7 +318,8 @@ class AvalancheScanner:
            _syslog.warning("SCRAPE_FAIL asset=%s err=%s", asset, exc)
            scrape_res = {}

-        # A6: collect scrape results locally, then merge atomically
+        # Collect scrape results locally then merge into the shared dict.
+        # The event loop is single-threaded so the merge is safe without a lock.
        scrape_count = 0
        local_scrape: Dict = {k: [] for k in self._scrape_hits}
        for k in self._scrape_hits:
@@ -336,7 +329,7 @@ class AvalancheScanner:
                    item["pivot_depth"] = depth
                local_scrape[k].append(item)
                scrape_count += 1
-        # Atomic merge into shared dict (single-threaded event loop — safe)
+        # Merge into shared dict — safe within the single-threaded event loop.
        for k, items in local_scrape.items():
            self._scrape_hits[k].extend(items)
        _out("ok" if scrape_count else "dim",
@@ -393,7 +386,6 @@ class AvalancheScanner:
            queued.add(child_key)
            child_entry = {"asset": val, "qtype": vqtype, "found_in": phase, "ref": ref}
            children.append(child_entry)
-            # A8: prevent duplicate entries in discovered_assets across parallel parents
            if child_key not in self._seen_discovered:
                self._seen_discovered.add(child_key)
                self.discovered_assets.append({
@@ -412,12 +404,12 @@ class AvalancheScanner:
                self._process(val, depth + 1, parent=asset, found_in=phase)
            )

-        # A5: run child tasks FIRST, then append pivot_log so the log reflects actual outcomes
+        # Run child tasks before appending to pivot_log so the log reflects actual outcomes.
        if child_tasks:
            _out("info", f"{indent}  → reinjecting {len(child_tasks)} new asset(s)…")
            await asyncio.gather(*child_tasks, return_exceptions=True)

-        # ── Log this node (after children complete — A5) ──────────────
+        # ── Log this node ─────────────────────────────────────────────
        self.pivot_log.append({
            "asset":    asset,
            "qtype":    qtype,
@@ -461,8 +453,8 @@ class AvalancheScanner:
    # ── Scrape dispatcher ─────────────────────────────────────────────

    async def _async_scrape(self, asset: str) -> dict:
-        # A3: instantiate a fresh Session + ScrapeEngine per call to avoid sharing
-        # a non-thread-safe requests.Session / cloudscraper across concurrent coroutines.
+        # Instantiate a fresh Session and ScrapeEngine per call — requests.Session
+        # and cloudscraper are not safe to share across concurrent coroutines.
        _empty: dict = {"pastes": [], "credentials": [], "hashes": [],
                        "telegram": [], "dork_misconfigs": []}
        try:
@@ -517,8 +509,7 @@ async def _crack_and_inject(session, hash_value: str, record_ref,
    _out("ok", f"  [crack] {hash_value[:16]}… → {plaintext}  (from {parent_asset})")
    cracked_out.append(plaintext)

-    # A4: inject cracked plaintext as qtype="password" — NOT as username.
-    # Only pivot on it if sources support password-recycling queries.
+    # Inject the cracked plaintext as a password-recycling pivot seed.
    key = plaintext.lower()
    if key not in seen_assets and depth + 1 <= _cfg_depth(scanner._orc):
        await scanner._process(plaintext, depth + 1,