NOX Framework v1.0.0

This commit is contained in:
nox-project
2026-04-07 10:17:43 +02:00
commit 913e764133
163 changed files with 15613 additions and 0 deletions
View File
+243
View File
@@ -0,0 +1,243 @@
"""
sources/helpers/config_handler.py — NOX Framework
Unified credential management via ~/.config/nox-cli/apikeys.json (XDG).
Priority: environment variable → apikeys.json → None
"""
from __future__ import annotations
import json
import os
from pathlib import Path
from typing import Dict, Optional
# ── Shared constant — import this everywhere instead of a raw string ───
UNIVERSAL_PLACEHOLDER = "INSERT_API_KEY_HERE"
# ── XDG config path ────────────────────────────────────────────────────
# An XDG_CONFIG_HOME that is set but empty must be treated as unset (XDG Base
# Directory spec); Path("") would otherwise resolve to the current directory
# and the key store would be created wherever the tool happens to be run.
# Using `or` also keeps Path.home() from being evaluated when it is not needed.
_CONFIG_DIR = Path(os.environ.get("XDG_CONFIG_HOME") or Path.home() / ".config") / "nox-cli"
_APIKEYS_FILE = _CONFIG_DIR / "apikeys.json"
# ── Complete service registry ──────────────────────────────────────────
# Format: key_name → {"display": str, "public": bool}
# public=True → no key needed, always active
# public=False → requires a real API key (goes into apikeys.json)
# The names of the private entries become the keys of the default
# apikeys.json store (see _default_store).
SERVICE_REGISTRY: Dict[str, Dict] = {
    # ── Public / keyless ──────────────────────────────────────────────
    "alienvault_otx_domain": {"display": "AlienVault OTX (Domain)", "public": True},
    "alienvault_otx_ip": {"display": "AlienVault OTX (IP)", "public": True},
    "alienvault_otx_malware": {"display": "AlienVault OTX (Malware)", "public": True},
    "alienvault_otx_user": {"display": "AlienVault OTX (User)", "public": True},
    "anubis_subdomains": {"display": "Anubis Subdomains", "public": True},
    "bgpview_ip": {"display": "BGPView IP", "public": True},
    "checkleaked": {"display": "CheckLeaked", "public": True},
    "crt_sh": {"display": "crt.sh", "public": True},
    "cve_search": {"display": "CVE Search", "public": True},
    "cxsecurity": {"display": "CXSecurity", "public": True},
    "duckduckgo_api": {"display": "Google / DDG Dorks", "public": True},
    "emailrep_io": {"display": "EmailRep.io", "public": True},
    "github_users": {"display": "GitHub Users", "public": True},
    "gitlab_search": {"display": "GitLab Search", "public": True},
    "gravatar": {"display": "Gravatar", "public": True},
    "hackernews_user": {"display": "HackerNews User", "public": True},
    "hackertarget_dnslookup": {"display": "HackerTarget DNS Lookup", "public": True},
    "hackertarget_hostsearch": {"display": "HackerTarget Host Search", "public": True},
    "hackertarget_reverseip": {"display": "HackerTarget Reverse IP", "public": True},
    "hackertarget_whois": {"display": "WHOIS (HackerTarget)", "public": True},
    "hudsonrock_osint": {"display": "HudsonRock OSINT", "public": True},
    "ipapi_co": {"display": "ipapi.co", "public": True},
    "ipinfo_io": {"display": "IPInfo.io", "public": True},
    "ipvigilante": {"display": "IPVigilante", "public": True},
    "keybase_lookup": {"display": "Keybase Lookup", "public": True},
    "keybase_proofs": {"display": "Keybase Proofs", "public": True},
    "maltiverse_ip": {"display": "Maltiverse IP", "public": True},
    "npm_user": {"display": "NPM User", "public": True},
    "packetstorm": {"display": "PacketStorm", "public": True},
    "phishtank_check": {"display": "PhishTank", "public": True},
    "pulsedive": {"display": "Pulsedive (Free)", "public": True},
    "pypi_user": {"display": "PyPI User", "public": True},
    "reddit_user": {"display": "Reddit User", "public": True},
    "robtex_ip": {"display": "Robtex IP", "public": True},
    "scamwatcher": {"display": "ScamWatcher", "public": True},
    "social_scan": {"display": "Social Scan", "public": True},
    "sublist3r_api": {"display": "Sublist3r API", "public": True},
    "threatcrowd_domain": {"display": "ThreatCrowd (Domain)", "public": True},
    "threatcrowd_email": {"display": "ThreatCrowd (Email)", "public": True},
    "threatminer_domain": {"display": "ThreatMiner (Domain)", "public": True},
    "threatminer_ip": {"display": "ThreatMiner (IP)", "public": True},
    "urlscan_search": {"display": "URLScan.io", "public": True},
    "vigilante_pw": {"display": "Vigilante.pw", "public": True},
    "wayback_machine": {"display": "Wayback Machine", "public": True},
    # ── Private / key-required ────────────────────────────────────────
    "ABSTRACT_API_KEY": {"display": "Abstract Email Validation", "public": False},
    "ABUSEIPDB_API_KEY": {"display": "AbuseIPDB", "public": False},
    "ANYRUN_API_KEY": {"display": "Any.run", "public": False},
    "BA_API_KEY": {"display": "BreachAware", "public": False},
    "BD_API_KEY": {"display": "BreachDirectory", "public": False},
    "BINARYEDGE_API_KEY": {"display": "BinaryEdge", "public": False},
    "BING_API_KEY": {"display": "Bing Search API", "public": False},
    "CENSYS_AUTH_BASE64": {"display": "Censys", "public": False},
    "CIRCL_AUTH_BASE64": {"display": "CIRCL.lu PDNS", "public": False},
    "CIT0DAY_API_KEY": {"display": "Cit0day", "public": False},
    "CLEARBIT_API_KEY": {"display": "Clearbit Enrich", "public": False},
    "CRIMINALIP_API_KEY": {"display": "CriminalIP", "public": False},
    "DEHASHED_AUTH_BASE64": {"display": "Dehashed", "public": False},
    "DNSDB_API_KEY": {"display": "DNSDB Passive DNS", "public": False},
    "DT_AUTH_BASE64": {"display": "DomainTools WHOIS", "public": False},
    "EXTREME_API_KEY": {"display": "Extreme IP Lookup", "public": False},
    "FLP_API_KEY": {"display": "FraudLabsPro", "public": False},
    "FOFA_API_KEY": {"display": "FOFA", "public": False},
    "FOFA_EMAIL": {"display": "FOFA (account email)", "public": False},
    "FULLCONTACT_API_KEY": {"display": "FullContact", "public": False},
    "GITHUB_TOKEN": {"display": "GitHub (Code/Repo Search)", "public": False},
    "GOOGLE_API_KEY": {"display": "Google Safe Browsing", "public": False},
    "GOOGLE_CX_KEY": {"display": "Google Custom Search (API key)", "public": False},
    "GOOGLE_CX_ID": {"display": "Google Custom Search (CX ID)", "public": False},
    "GREYNOISE_API_KEY": {"display": "GreyNoise", "public": False},
    "HASHES_API_KEY": {"display": "Hashes.org", "public": False},
    "HIBP_API_KEY": {"display": "HaveIBeenPwned", "public": False},
    "HIPPO_API_KEY": {"display": "EmailHippo", "public": False},
    "HUNTER_API_KEY": {"display": "Hunter.io", "public": False},
    "HYBRID_API_KEY": {"display": "Hybrid Analysis", "public": False},
    "INTELX_API_KEY": {"display": "IntelX", "public": False},
    "INTEZER_API_KEY": {"display": "Intezer", "public": False},
    "IPDATA_API_KEY": {"display": "IPData.co", "public": False},
    "IPGEO_API_KEY": {"display": "IPGeolocation.io", "public": False},
    "IPINFODB_API_KEY": {"display": "IPInfoDB", "public": False},
    "IPQS_API_KEY": {"display": "IPQualityScore", "public": False},
    "IPSTACK_API_KEY": {"display": "IPStack", "public": False},
    "JOE_API_KEY": {"display": "Joe Sandbox", "public": False},
    "LEAKCHECK_API_KEY": {"display": "LeakCheck", "public": False},
    "LEAKIX_API_KEY": {"display": "LeakIX", "public": False},
    "LEAKSTATS_API_KEY": {"display": "LeakStats.pw", "public": False},
    "MAILBOX_API_KEY": {"display": "Mailboxlayer", "public": False},
    "MALSHARE_API_KEY": {"display": "MalShare", "public": False},
    "METADEFENDER_API_KEY": {"display": "MetaDefender", "public": False},
    "MISP_API_KEY": {"display": "MISP", "public": False},
    "NUMVERIFY_API_KEY": {"display": "Numverify", "public": False},
    "ONYPHE_API_KEY": {"display": "Onyphe", "public": False},
    "PASSIVETOTAL_AUTH_BASE64": {"display": "PassiveTotal / RiskIQ", "public": False},
    "PIPL_API_KEY": {"display": "Pipl", "public": False},
    "PULSEDIVE_API_KEY": {"display": "Pulsedive (Premium)", "public": False},
    "RF_TOKEN": {"display": "Recorded Future", "public": False},
    "SECURITYTRAILS_API_KEY": {"display": "SecurityTrails", "public": False},
    "SHODAN_API_KEY": {"display": "Shodan", "public": False},
    "SNUSBASE_API_KEY": {"display": "Snusbase", "public": False},
    "SPYCLOUD_API_KEY": {"display": "SpyCloud", "public": False},
    "SPYONWEB_API_KEY": {"display": "SpyOnWeb", "public": False},
    "SPYSE_API_KEY": {"display": "Spyse", "public": False},
    "TC_API_KEY": {"display": "ThreatConnect", "public": False},
    "TINES_API_KEY": {"display": "Tines Breach", "public": False},
    "TP_API_KEY": {"display": "ThreatPortal", "public": False},
    "TWITTER_BEARER_TOKEN": {"display": "Twitter / X API v2", "public": False},
    "URLVOID_API_KEY": {"display": "URLVoid", "public": False},
    "VIEWDNS_API_KEY": {"display": "ViewDNS", "public": False},
    "VIRUSTOTAL_API_KEY": {"display": "VirusTotal", "public": False},
    "VULNERS_API_KEY": {"display": "Vulners", "public": False},
    "WF_API_KEY": {"display": "WhoisFreaks", "public": False},
    "WHOISXML_API_KEY": {"display": "WhoisXML API", "public": False},
    "WHOXY_API_KEY": {"display": "Whoxy WHOIS", "public": False},
    "ZEROBOUNCE_API_KEY": {"display": "ZeroBounce", "public": False},
    "ZOOMEYE_API_KEY": {"display": "ZoomEye", "public": False},
}
# Subset of the registry whose entries are marked public=False.
_PRIVATE_KEYS = {k: v for k, v in SERVICE_REGISTRY.items() if not v["public"]}
# ── Store helpers ──────────────────────────────────────────────────────
def _default_store() -> Dict[str, str]:
    """Build a fresh store in which every private service key holds the placeholder."""
    return dict.fromkeys(_PRIVATE_KEYS, UNIVERSAL_PLACEHOLDER)
def _write_store(data: Dict[str, str]) -> None:
    """Atomically write *data* to apikeys.json, readable by the owner only.

    The JSON is written to a sibling temporary file and then renamed over the
    real file, so a reader never observes a half-written store. The temporary
    file is created with mode 0600 *before* any secret is written to it, so
    the keys never sit on disk with umask-default permissions.

    Args:
        data: Mapping of key name to value; serialised with sorted keys.

    Raises:
        RuntimeError: if the config directory or file cannot be written
            because of insufficient permissions.
    """
    try:
        _CONFIG_DIR.mkdir(mode=0o700, parents=True, exist_ok=True)
        _CONFIG_DIR.chmod(0o700)
        tmp = _APIKEYS_FILE.with_suffix(".tmp")
        payload = json.dumps(data, indent=4, sort_keys=True)
        fd = os.open(tmp, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
        with os.fdopen(fd, "w", encoding="utf-8") as handle:
            # A stale temp file from an earlier run keeps its old mode when
            # reopened; tighten it before the secrets go in.
            tmp.chmod(0o600)
            handle.write(payload)
        tmp.replace(_APIKEYS_FILE)
        _APIKEYS_FILE.chmod(0o600)
    except PermissionError as exc:
        raise RuntimeError(f"[config_handler] Cannot write {_APIKEYS_FILE}: {exc}") from exc
def _load_store() -> Dict[str, str]:
    """Load apikeys.json, creating it with defaults if absent.

    Self-heals on corrupt files: a store that is empty, is not valid UTF-8,
    is not valid JSON, or whose root is not an object is moved aside to
    ``apikeys.json.bak`` and replaced with the default store.

    Keys present in the registry but missing from the file are back-filled
    with the placeholder and the file is rewritten.

    Returns:
        The key store as a dict.

    Raises:
        RuntimeError: if the existing file cannot be read because of
            insufficient permissions (or if rewriting it fails the same way).
    """
    _CONFIG_DIR.mkdir(mode=0o700, parents=True, exist_ok=True)
    _CONFIG_DIR.chmod(0o700)
    if not _APIKEYS_FILE.exists():
        print(" \033[92m[+]\033[0m Initializing NOX Environment in ~/.config/nox-cli/")
        defaults = _default_store()
        _write_store(defaults)
        return defaults
    try:
        text = _APIKEYS_FILE.read_text(encoding="utf-8").strip()
        if not text:
            raise json.JSONDecodeError("Empty file", "", 0)
        data = json.loads(text)
        if not isinstance(data, dict):
            raise json.JSONDecodeError("Root is not a JSON object", text, 0)
        # Back-fill keys added in newer versions
        missing = {k: UNIVERSAL_PLACEHOLDER for k in _PRIVATE_KEYS if k not in data}
        if missing:
            data.update(missing)
            _write_store(data)
        return data
    except (json.JSONDecodeError, UnicodeDecodeError):
        # UnicodeDecodeError: read_text() fails before JSON parsing on a
        # binary-garbage file; that is corruption too and must self-heal.
        bak = _APIKEYS_FILE.with_suffix(".json.bak")
        # replace() overwrites an older backup on every platform; rename()
        # raises FileExistsError on Windows when the target already exists.
        _APIKEYS_FILE.replace(bak)
        print(f"[!] Malformed apikeys.json detected — backed up to {bak.name} and reset to defaults.")
        defaults = _default_store()
        _write_store(defaults)
        return defaults
    except PermissionError as exc:
        raise RuntimeError(f"[config_handler] Cannot read {_APIKEYS_FILE}: {exc}") from exc
# ── ConfigManager ──────────────────────────────────────────────────────
class ConfigManager:
    """
    Unified API key manager.

    Resolution order per key:
      1. Environment variable (exact key name)
      2. ~/.config/nox-cli/apikeys.json
      3. Returns None if value equals UNIVERSAL_PLACEHOLDER or is absent

    All state lives on the class itself and every method is a classmethod,
    so the manager is used without instantiation.
    """

    # Resolved value per key name; None records "not configured".
    _cache: Dict[str, Optional[str]] = {}
    # Parsed apikeys.json; loaded on first access by _get_store().
    _store: Optional[Dict[str, str]] = None

    @classmethod
    def _get_store(cls) -> Dict[str, str]:
        """Return the key store, loading it from disk on first use."""
        if cls._store is None:
            cls._store = _load_store()
        return cls._store

    @classmethod
    def get_key(cls, key_name: str) -> Optional[str]:
        """Return the configured value, or None if missing/placeholder."""
        if key_name in cls._cache:
            return cls._cache[key_name]
        val = os.environ.get(key_name, "") or cls._get_store().get(key_name, "")
        result = None if (not val or val == UNIVERSAL_PLACEHOLDER) else val
        cls._cache[key_name] = result
        return result

    # Backward-compatible alias used by nox.py internals
    get = get_key

    @classmethod
    def set(cls, key_name: str, value: str) -> None:
        """Persist a key to apikeys.json and invalidate its cached lookup.

        The cached entry is dropped rather than overwritten so that the next
        get_key() call re-applies the single resolution rule: an environment
        variable still takes precedence, and an empty value resolves to None
        exactly as it would after a restart.
        """
        store = cls._get_store()
        store[key_name] = value
        _write_store(store)
        cls._cache.pop(key_name, None)

    @classmethod
    def config_path(cls) -> Path:
        """Return the path of the apikeys.json file."""
        return _APIKEYS_FILE
+119
View File
@@ -0,0 +1,119 @@
"""
sources/helpers/cracker.py
Resilient async hash cracker for NOX autoscan.
Detects MD5 / SHA1 / SHA256 / bcrypt hashes inside breach records,
fires background crack attempts against available APIs, and returns
results without ever blocking the main pivot pipeline.
"""
import asyncio
import logging
import re
from typing import List, Optional, Tuple
# C2: MD5 and NTLM share the same 32-char hex pattern.
# We list md5 first (most common in breach data) but also accept ntlm
# so callers can query NTLM-specific APIs when needed.
# Ordered (name, pattern) pairs; detect_hash returns the first match.
_PATTERNS: List[Tuple[str, re.Pattern]] = [
    ("bcrypt", re.compile(r"^\$2[aby]?\$\d{2}\$.{53}$")),
    ("sha256", re.compile(r"^[a-f0-9]{64}$", re.I)),
    ("sha1", re.compile(r"^[a-f0-9]{40}$", re.I)),
    ("md5", re.compile(r"^[a-f0-9]{32}$", re.I)),
    # ntlm shares the 32-char hex pattern — detected as md5 first,
    # but async_crack queries both md5 and ntlm APIs for 32-char hashes.
]
# Writes to ~/.config/nox-cli/logs/nox_system.log — never to terminal
_syslog = logging.getLogger("nox.system")
# Per-API timeout — each individual rainbow-table query budget
_API_TIMEOUT = 8
# Global crack budget — hard cap regardless of API count or response order
CRACK_TIMEOUT = 20


def detect_hash(value: str) -> Optional[str]:
    """Return hash type string if value matches a known hash pattern, else None.

    Surrounding whitespace is ignored. Values that are not strings (for
    example a None or numeric field taken from a breach record) are reported
    as "not a hash" instead of raising AttributeError.
    """
    if not isinstance(value, str):
        return None
    candidate = value.strip()
    for htype, pat in _PATTERNS:
        if pat.match(candidate):
            return htype
    return None
async def _query_api(session, url: str, fmt: str) -> Optional[str]:
    """Single API query — returns plaintext or None. Never raises.

    Args:
        session: Object whose ``get(url, timeout=...)`` returns an async
            context manager yielding a response (presumably an
            aiohttp.ClientSession — confirm against the caller).
        url: Fully-built lookup URL.
        fmt: "text" to treat the body as the plaintext itself; any other
            value parses the body as JSON and reads the first non-empty
            string among the "result", "plaintext" and "plain" fields.

    Returns:
        The recovered plaintext, or None on a non-200 status, an error-like or
        oversized text body, an unusable JSON payload, or any failure.
    """
    try:
        import aiohttp
        budget = aiohttp.ClientTimeout(total=_API_TIMEOUT)
        async with session.get(url, timeout=budget) as resp:
            if resp.status != 200:
                return None
            if fmt == "text":
                text = (await resp.text()).strip()
                # Reject empty, too-long, or obvious error responses
                if not text or len(text) > 128:
                    return None
                if text.lower().startswith(
                    ("not found", "error", "invalid", "no result", "not in",
                     "cmd5-error", "not exist", "code erreur", "erreur", "unknown")
                ):
                    return None
                return text
            data = await resp.json(content_type=None)
    except Exception as exc:
        # Best-effort by design, but leave a trace in the system log. Only the
        # host is logged so the queried hash does not end up in log files.
        host = url.partition("://")[2].partition("/")[0]
        _syslog.debug("crack API query to %s failed: %s", host, type(exc).__name__)
        return None
    # Some endpoints answer with a list or a bare value; only an object with a
    # string field can carry a plaintext.
    if not isinstance(data, dict):
        return None
    for field in ("result", "plaintext", "plain"):
        candidate = data.get(field)
        if isinstance(candidate, str) and candidate:
            return candidate
    return None
async def async_crack(session, hash_value: str, hash_type: str) -> Optional[str]:
    """
    Query multiple rainbow-table APIs concurrently.

    Returns first plaintext found, or None. bcrypt is skipped, and so is any
    value that is not plain hexadecimal: the hash is interpolated directly
    into third-party URLs, so anything else is refused before a request is
    built.

    C2: for 32-char hex (md5/ntlm ambiguity), also query NTLM-specific APIs.
    Per-API timeout: 8s (applied inside _query_api). Global budget:
    CRACK_TIMEOUT seconds, enforced through ``asyncio.as_completed``.
    All tasks still pending when the function exits — first hit, exhausted
    budget, or cancellation of the caller — are cancelled and awaited.
    Cancellation of the caller is propagated, never swallowed.
    """
    if hash_type == "bcrypt":
        return None
    h = hash_value.strip().lower()
    if not re.fullmatch(r"[0-9a-f]+", h):
        return None
    apis = [
        (f"https://www.nitrxgen.net/md5db/{h}", "text"),
        (f"https://hashes.com/en/api/hash?hash={h}", "json"),
        (f"https://hash.help/api/lookup/{h}", "json"),
        (f"https://hashkiller.io/api/search.php?hash={h}", "json"),
        (f"https://md5decrypt.net/Api/api.php?hash={h}&hash_type={hash_type}&email=&code=", "text"),
        (f"https://www.cmd5.org/api.ashx?hash={h}", "text"),
    ]
    # C2: for 32-char hashes (md5/ntlm ambiguous), add NTLM-specific endpoint
    if hash_type == "md5" and len(h) == 32:
        apis.append((f"https://hashes.com/en/api/hash?hash={h}&type=ntlm", "json"))
    # Real tasks (not bare coroutines) so the losers can be cancelled below.
    tasks = [asyncio.create_task(_query_api(session, url, fmt)) for url, fmt in apis]
    result: Optional[str] = None
    try:
        for next_done in asyncio.as_completed(tasks, timeout=CRACK_TIMEOUT):
            try:
                res = await next_done
            except asyncio.CancelledError:
                # The caller was cancelled: clean up in `finally` and let it propagate.
                raise
            except asyncio.TimeoutError:
                # Global budget exhausted — every remaining await would time out too.
                break
            except Exception:
                continue
            if res:
                result = res
                break
    finally:
        # Cancel all remaining tasks and await to suppress pending-task warnings
        pending = [t for t in tasks if not t.done()]
        for t in pending:
            t.cancel()
        await asyncio.gather(*pending, return_exceptions=True)
    return result
+658
View File
@@ -0,0 +1,658 @@
"""
sources/helpers/reporting.py
NOX Enterprise Reporting — Executive Summary, Pivot Chain, Data Sanitization.
"""
import hashlib
import html as _html
import json
import re
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, List
# ── Noise patterns stripped from all report output ────────────────────
# Case-insensitive markers of Python traceback / network-library error text.
# _clean() and _raw() return "" for any value in which this pattern is found.
_NOISE_RE = re.compile(
    r"(Traceback \(most recent|File \".*\.py\"|TimeoutError|ProxyError"
    r"|ConnectionError|aiohttp\.|ClientConnector|ssl\.|asyncio\."
    r"|Task exception|NoneType|Object of type)",
    re.I,
)
# Control characters removed from report values: U+0000–U+001F except tab,
# LF and CR, plus DEL and U+0080–U+009F.
_CTRL_RE = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1f\x7f-\x9f]")
def _nox_ver() -> str:
    """Return nox.VERSION, or "1.0.0" when it cannot be imported."""
    try:
        from nox import VERSION as version  # type: ignore
    except ImportError:
        version = "1.0.0"
    return version
def _clean(v: Any, maxlen: int = 200) -> str:
    """Sanitise *v* for HTML output.

    Control characters are removed, values containing technical noise collapse
    to "", the text is cut to *maxlen* characters and finally HTML-escaped.
    """
    text = "" if v is None else str(v)
    text = _CTRL_RE.sub("", text)
    if _NOISE_RE.search(text) is not None:
        return ""
    shortened = text[:maxlen]
    return _html.escape(shortened)
def _raw(v: Any, maxlen: int = 200) -> str:
    """Sanitise *v* for PDF / plain-text output — no HTML escaping.

    Control characters are removed, values containing technical noise collapse
    to "", and the remainder is cut to *maxlen* characters.
    """
    text = "" if v is None else str(v)
    stripped = _CTRL_RE.sub("", text)
    return "" if _NOISE_RE.search(stripped) else stripped[:maxlen]
def _pdf_safe(s: str, maxlen: int = 180) -> str:
    """Reduce *s* to plain ASCII so the fpdf2 core fonts can render it.

    D4: the value is first sanitised and truncated by _raw(). NFKD
    normalisation then splits accented characters into base letter plus
    combining mark; the combining marks are dropped so "é" survives as "e"
    (without this step the mark itself would be replaced, giving "e?").
    Characters with no ASCII decomposition (Cyrillic, CJK, etc.) become '?' —
    intentional, as the core fonts cannot render them.

    Args:
        s: Value to sanitise.
        maxlen: Maximum length of the returned string.

    Returns:
        An ASCII-only string of at most *maxlen* characters.
    """
    s = _raw(s, maxlen)
    try:
        import unicodedata
        decomposed = unicodedata.normalize("NFKD", s)
        base_only = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
        ascii_text = base_only.encode("ascii", errors="replace").decode("ascii")
        # Compatibility decomposition can lengthen the text (e.g. ligatures),
        # so re-apply the cap.
        return ascii_text[:maxlen]
    except Exception:
        return s.encode("latin-1", errors="replace").decode("latin-1")
def _rget(r: Any, k: str) -> str:
    """Read field *k* from a record as a string.

    Dicts are read by key, anything else by attribute. A missing or falsy
    value yields "".
    """
    value = r.get(k, "") if isinstance(r, dict) else getattr(r, k, "")
    return str(value or "")
# ── Executive summary builder ─────────────────────────────────────────
def build_exec_summary(data: dict) -> dict:
    """
    Returns a dict with all dashboard KPIs needed by every format.
    Expects data keys: records, analysis, scan_meta (optional).

    Records may be dicts or objects; fields are read through _rget(). A
    record whose risk_score is missing or not numeric is counted as 0 ("Info")
    rather than aborting the whole report. Keys that are present but None
    (records, pivot_chain, scan_meta, analysis) are treated as empty.

    Returns:
        Dict with keys total_records, nodes_discovered, cleartext_passwords,
        pivot_depth, elapsed, buckets and hvt_count.
    """
    records = data.get("records") or []
    meta = data.get("scan_meta", {}) or {}
    analysis = data.get("analysis", {}) or {}

    def _score(rec: Any) -> float:
        try:
            return float(_rget(rec, "risk_score") or 0)
        except ValueError:
            return 0.0

    def _is_hvt(rec: Any) -> bool:
        # Same dict-or-object rule the HTML report applies per row.
        if isinstance(rec, dict):
            return bool(rec.get("is_hvt"))
        return bool(getattr(rec, "is_hvt", False))

    cleartext = sum(1 for r in records if _rget(r, "password"))
    nodes = len({_rget(r, "email") or _rget(r, "username") for r in records} - {""})
    depth = meta.get("pivot_depth", len(data.get("pivot_chain") or []))

    elapsed = meta.get("elapsed_seconds")
    try:
        elapsed_text = f"{float(elapsed):.1f}s" if elapsed is not None else "N/A"
    except (TypeError, ValueError):
        elapsed_text = "N/A"

    buckets: Dict[str, int] = {"Critical": 0, "High": 0, "Medium": 0, "Low": 0, "Info": 0}
    for r in records:
        rs = _score(r)
        if rs >= 90:
            buckets["Critical"] += 1
        elif rs >= 70:
            buckets["High"] += 1
        elif rs >= 40:
            buckets["Medium"] += 1
        elif rs >= 10:
            buckets["Low"] += 1
        else:
            buckets["Info"] += 1

    return {
        "total_records": len(records),
        "nodes_discovered": nodes,
        "cleartext_passwords": cleartext,
        "pivot_depth": depth,
        "elapsed": elapsed_text,
        "buckets": buckets,
        "hvt_count": analysis.get("hvt_count", sum(1 for r in records if _is_hvt(r))),
    }
# ── Pivot chain renderer ──────────────────────────────────────────────
def render_pivot_chain(data: dict) -> List[str]:
    """
    Build a human-readable pivot chain, one line per node.

    D2: check pivot_log first before falling back to record-based reconstruction.

    Sources, in order of preference:
      1. data["pivot_log"] — one line per entry, indented by its depth; a
         non-root entry names its parent as " (from <parent>)" when known.
      2. data["pivot_chain"] with more than one node — seed plus one
         "[Pivot]" line per following node.
      3. Otherwise a best-effort list built from the first 40 records and the
         first 5 dork results, de-duplicated case-insensitively.

    Returns:
        The chain lines; a single "[SEED] ... (no pivot data)" line when
        nothing beyond the seed is available.
    """
    chain = data.get("pivot_chain") or []
    target = _raw(data.get("target", "?"))
    # D2: if pivot_log is available, build chain from it (accurate tree)
    pivot_log = data.get("pivot_log") or []
    if pivot_log:
        lines: List[str] = []
        for e in pivot_log:
            depth = e.get("depth", 0)
            asset = _raw(e.get("asset", ""))
            phase = _raw(e.get("found_in", e.get("source", "?")))
            parent = _raw(e.get("parent") or "")
            prefix = " " * depth
            if depth == 0:
                lines.append(f"[SEED] {asset}")
            else:
                # Asset and parent used to be glued together with no separator,
                # producing an unreadable merged identifier.
                origin = f" (from {parent})" if parent else ""
                lines.append(f"{prefix}└─ [{phase}] {asset}{origin}")
        return lines
    if len(chain) <= 1:
        # No pivot data — reconstruct best-effort from records
        records = data.get("records") or []
        lines = [f"[SEED] {target}"]
        seen: set = {target.lower()}
        for r in records[:40]:
            src = _raw(_rget(r, "source"))
            em = _raw(_rget(r, "email"))
            usr = _raw(_rget(r, "username"))
            ident = em or usr
            if not ident or ident.lower() in seen:
                continue
            seen.add(ident.lower())
            lines.append(f" └─ [{src}] → {ident}")
        dork_results = data.get("dork_results") or []
        for d in dork_results[:5]:
            url = _raw(d.get("url", ""))
            if url and url.lower() not in seen:
                seen.add(url.lower())
                lines.append(f" └─ [Dork] → {url[:80]}")
        return lines if len(lines) > 1 else [f"[SEED] {target} (no pivot data)"]
    # Ordered pivot chain from AvalancheScanner
    lines = [f"[SEED] {_raw(chain[0])}"]
    for node in chain[1:]:
        lines.append(f" └─ [Pivot] → {_raw(node)}")
    return lines
# ── JSON report ───────────────────────────────────────────────────────
def to_json(data: dict, path: str) -> None:
    """Write the scan as a JSON report to *path* (UTF-8, 2-space indent).

    The report holds the executive summary, the rendered pivot chain, at most
    1000 records (with the raw_data/metadata fields and any noise-matching
    values removed), and the dork and scrape results as supplied.

    One timestamp is taken per report, so "generated", "_meta.timestamp" and
    the scan id derived from it always agree.

    Args:
        data: Scan result dict (keys read: records, target, dork_results,
            scrape_results, plus whatever the summary and chain builders read).
        path: Destination file path.
    """
    from enum import Enum

    summary = build_exec_summary(data)
    chain = render_pivot_chain(data)
    records = data.get("records") or []
    target = data.get("target", "")
    version = _nox_ver()
    generated = datetime.now().isoformat()

    def _ser(o):
        # json.dumps fallback for values it cannot encode natively.
        if isinstance(o, Enum):
            return o.name
        if hasattr(o, "to_dict"):
            return o.to_dict()
        return str(o)

    clean_records = []
    for r in records:
        d = r.to_dict() if hasattr(r, "to_dict") else (r if isinstance(r, dict) else {})
        # drop noise fields
        clean_records.append({
            k: v for k, v in d.items()
            if k not in ("raw_data", "metadata") and not _NOISE_RE.search(str(v or ""))
        })
    # Include dork and scrape results in JSON output
    dork_results = data.get("dork_results", []) or []
    scrape_results = data.get("scrape_results", {}) or {}
    # D3: apply consistent cap (1000) — same as HTML
    _RECORD_CAP = 1000
    out_data = {
        "framework": f"NOX v{version}",
        "generated": generated,
        "target": target,
        # J3: self-describing metadata block
        "_meta": {
            "scan_id": hashlib.sha256(f"{target}{generated}".encode()).hexdigest()[:16],
            "target": target,
            "timestamp": generated,
            "nox_version": version,
            # NOTE(review): this is populated from the record count, not from
            # a count of sources — confirm what consumers expect here.
            "sources_queried": summary.get("total_records", 0),
            "pivot_depth_reached": summary.get("pivot_depth", 0),
            "record_cap": _RECORD_CAP,
            "truncated": len(clean_records) > _RECORD_CAP,
        },
        "executive_summary": summary,
        "pivot_chain": chain,
        "records": clean_records[:_RECORD_CAP],
        "dork_results": dork_results,
        "scrape_results": scrape_results,
    }
    Path(path).write_text(json.dumps(out_data, indent=2, default=_ser), encoding="utf-8")
    print(f"[+] JSON report saved: {path}")
# ── HTML report ───────────────────────────────────────────────────────
# Stylesheet embedded verbatim in the <style> element of the HTML report.
# Row classes c / h / m tint table rows; .pw highlights password cells.
_CSS = (
    "*{margin:0;padding:0;box-sizing:border-box}"
    "body{font-family:'Courier New',monospace;background:#0a0a0a;color:#e0e0e0;padding:20px}"
    ".hdr{text-align:center;padding:28px;border:1px solid #333;margin-bottom:18px;background:#111}"
    ".hdr h1{color:#00ff41;font-size:26px;letter-spacing:4px}"
    ".hdr p{color:#888;margin-top:5px;font-size:12px}"
    ".kpis{display:grid;grid-template-columns:repeat(auto-fit,minmax(160px,1fr));gap:10px;margin:14px 0}"
    ".kpi{background:#111;border:1px solid #333;padding:16px;text-align:center}"
    ".kpi .n{font-size:30px;font-weight:bold;color:#00ff41}"
    ".kpi .l{color:#888;font-size:10px;margin-top:3px}"
    ".kpi.warn .n{color:#ff6600} .kpi.crit .n{color:#ff0040}"
    ".sec{margin:18px 0} .sec h2{color:#00ff41;border-bottom:1px solid #333;padding-bottom:5px;margin-bottom:10px}"
    ".chain{background:#0d1a0d;border:1px solid #1a3a1a;padding:12px;font-size:11px;color:#00cc33;word-break:break-all;margin:8px 0}"
    "table{width:100%;border-collapse:collapse} th,td{padding:7px;border:1px solid #222;font-size:11px;word-break:break-all}"
    "th{background:#1a1a1a;color:#00ff41;text-transform:uppercase;font-size:10px} td{background:#0d0d0d}"
    "tr.c td{background:#1a0005} tr.h td{background:#1a0a00} tr.m td{background:#1a1500}"
    ".pw{color:#ff0040;font-weight:bold}"
)
def to_html(data: dict, path: str) -> None:
    """Write the scan as a self-contained HTML report to *path*.

    Sections: executive summary (KPIs + severity table), pivot chain, dork
    results, scrape results (pastes, extracted credentials, Telegram hits,
    misconfigurations) and the first 500 credential records.

    Every value interpolated into the page passes through _clean(). URLs taken
    from scan results are only turned into links when they start with
    http:// or https://; anything else (e.g. a ``javascript:`` URL) is shown
    as plain text. External links carry rel="noopener noreferrer".

    Args:
        data: Scan result dict (keys read: target, records, dork_results,
            scrape_results, plus whatever the summary and chain builders read).
        path: Destination file path.
    """
    # Local import: only `datetime` itself is imported at module level.
    from datetime import timezone

    def _cell(value: Any, limit: int) -> str:
        # Coerce before slicing so a None / non-string field cannot raise.
        return _clean(str(value or "")[:limit])

    def _external_link(url: Any, color: str) -> str:
        text = str(url or "")
        label = _clean(text[:80])
        if not text.lower().startswith(("http://", "https://")):
            return label
        return (f'<a href="{_clean(text)}" style="color:{color}" target="_blank" '
                f'rel="noopener noreferrer">{label}</a>')

    summary = build_exec_summary(data)
    chain = render_pivot_chain(data)
    target = _clean(data.get("target", "Unknown"))
    records = data.get("records") or []
    # The label says UTC, so the clock must actually be UTC.
    ts = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")
    # KPI dashboard
    kpis = (
        f'<div class="kpi"><div class="n">{summary["total_records"]}</div><div class="l">TOTAL RECORDS</div></div>'
        f'<div class="kpi"><div class="n">{summary["nodes_discovered"]}</div><div class="l">NODES DISCOVERED</div></div>'
        f'<div class="kpi crit"><div class="n">{summary["cleartext_passwords"]}</div><div class="l">CLEARTEXT PASSWORDS</div></div>'
        f'<div class="kpi warn"><div class="n">{summary["hvt_count"]}</div><div class="l">HIGH-VALUE TARGETS</div></div>'
        f'<div class="kpi"><div class="n">{summary["pivot_depth"]}</div><div class="l">PIVOT DEPTH</div></div>'
        f'<div class="kpi"><div class="n">{summary["elapsed"]}</div><div class="l">TOTAL TIME</div></div>'
    )
    # Severity table
    sev_rows = "".join(
        f"<tr><td>{lvl}</td><td>{cnt}</td></tr>"
        for lvl, cnt in summary["buckets"].items() if cnt
    )
    # Pivot chain
    chain_html = "".join(f'<div class="chain">{_clean(c)}</div>' for c in chain)
    # Credential rows (top 500, noise-free)
    cred_parts: List[str] = []
    for r in records[:500]:
        try:
            rs = float(_rget(r, "risk_score") or 0)
        except ValueError:
            rs = 0.0
        cls = "c" if rs >= 90 else "h" if rs >= 70 else "m" if rs >= 40 else ""
        em = _clean(_rget(r, "email") or _rget(r, "username"))
        pw = _clean(_rget(r, "password"))
        src = _clean(_rget(r, "source"))
        bd = _clean(_rget(r, "breach_date"))
        is_hvt = r.get("is_hvt") if isinstance(r, dict) else getattr(r, "is_hvt", False)
        # Both branches of the original marker were empty, so high-value
        # targets were never flagged in the table.
        hvt = " [HVT]" if is_hvt else ""
        cred_parts.append(
            f"<tr class='{cls}'><td>{em}{hvt}</td>"
            f"<td class='pw'>{pw}</td><td>{src}</td><td>{bd}</td><td>{rs:.0f}</td></tr>"
        )
    cred_rows = "".join(cred_parts)
    # Dork results section
    dork_results = data.get("dork_results", []) or []
    dork_parts: List[str] = []
    for h in dork_results:
        url = h.get("url", "")
        title = h.get("title", "") or h.get("dork", "")
        link = _external_link(url, "#00ff41") if url else _cell(title, 80)
        dork_parts.append(
            f"<tr><td>{link}</td><td>{_cell(h.get('snippet'), 120)}</td>"
            f"<td>{_cell(h.get('dork'), 80)}</td><td>{_clean(h.get('engine', ''))}</td></tr>"
        )
    dork_rows = "".join(dork_parts)
    dork_empty = "<tr><td colspan=4 style=text-align:center>No dork hits</td></tr>"
    dork_section = (
        f'<div class="sec"><h2>Dork Results ({len(dork_results)} hits)</h2>'
        f'<table><thead><tr><th>URL / Title</th><th>Snippet</th><th>Dork Query</th><th>Engine</th></tr></thead>'
        f'<tbody>{dork_rows or dork_empty}</tbody></table></div>'
    )
    # Scrape results section
    scrape_results = data.get("scrape_results", {}) or {}
    pastes = scrape_results.get("pastes") or []
    creds_sc = scrape_results.get("credentials") or []
    tg_hits = scrape_results.get("telegram") or []
    mc_hits = scrape_results.get("dork_misconfigs") or []

    def _pattern_summary(entry: dict) -> str:
        return _clean(", ".join(f"{k}({len(v)})" for k, v in (entry.get("patterns") or {}).items()))

    paste_rows = "".join(
        f"<tr><td>{_clean(p.get('site', ''))}</td><td>{_clean(p.get('id', ''))}</td>"
        f"<td>{_pattern_summary(p)}</td></tr>"
        for p in pastes
    )
    cred_sc_rows = "".join(
        f"<tr><td class='pw'>{_cell(c.get('raw'), 120)}</td>"
        f"<td>{_clean(c.get('source', ''))}</td><td>{_clean(c.get('paste_id', ''))}</td></tr>"
        for c in creds_sc
    )
    tg_parts: List[str] = []
    for t in tg_hits:
        ch = _clean(t.get("channel", ""))
        text = _cell(t.get("text"), 200)
        link = (f'<a href="https://t.me/s/{ch}" style="color:#00ff41" target="_blank" '
                f'rel="noopener noreferrer">t.me/s/{ch}</a>')
        tg_parts.append(f"<tr><td>{link}</td><td>{text}</td><td>{_pattern_summary(t)}</td></tr>")
    tg_rows = "".join(tg_parts)
    mc_parts: List[str] = []
    for m in mc_hits:
        url_m = m.get("url", "")
        title_m = _cell(m.get("title"), 80)
        dork_m = _cell(m.get("dork"), 80)
        link_m = _external_link(url_m, "#ff0040") if url_m else title_m
        mc_parts.append(f"<tr><td>{link_m}</td><td>{title_m}</td><td>{dork_m}</td></tr>")
    mc_rows = "".join(mc_parts)
    none_row = "<tr><td colspan=3 style=text-align:center>None</td></tr>"
    scrape_section = (
        f'<div class="sec"><h2>Scrape Results</h2>'
        f'<h3 style="color:#aaa;margin:10px 0 5px">Pastes ({len(pastes)})</h3>'
        f'<table><thead><tr><th>Site</th><th>Paste ID</th><th>Patterns</th></tr></thead>'
        f'<tbody>{paste_rows or none_row}</tbody></table>'
        f'<h3 style="color:#aaa;margin:10px 0 5px">Extracted Credentials ({len(creds_sc)})</h3>'
        f'<table><thead><tr><th>Raw Credential</th><th>Source</th><th>Paste ID</th></tr></thead>'
        f'<tbody>{cred_sc_rows or none_row}</tbody></table>'
        f'<h3 style="color:#aaa;margin:10px 0 5px">Telegram CTI ({len(tg_hits)})</h3>'
        f'<table><thead><tr><th>Channel</th><th>Message</th><th>Patterns</th></tr></thead>'
        f'<tbody>{tg_rows or none_row}</tbody></table>'
        f'<h3 style="color:#aaa;margin:10px 0 5px">Misconfigurations ({len(mc_hits)})</h3>'
        f'<table><thead><tr><th>URL</th><th>Title</th><th>Dork</th></tr></thead>'
        f'<tbody>{mc_rows or none_row}</tbody></table>'
        f'</div>'
    )
    page = (
        f'<!DOCTYPE html><html><head><meta charset="utf-8">'
        f'<title>NOX — {target}</title><style>{_CSS}</style></head><body>'
        f'<div class="hdr"><h1>[ NOX ]</h1>'
        f'<p>Target: {target} &nbsp;|&nbsp; {ts} &nbsp;|&nbsp; NOX v{_nox_ver()}</p></div>'
        f'<div class="sec"><h2>Executive Summary</h2>'
        f'<div class="kpis">{kpis}</div>'
        f'<table><thead><tr><th>Severity</th><th>Count</th></tr></thead>'
        f'<tbody>{sev_rows}</tbody></table></div>'
        f'<div class="sec"><h2>Pivot Chain</h2>{chain_html}</div>'
        f'{dork_section}'
        f'{scrape_section}'
        f'<div class="sec"><h2>Credential Records (top 500)</h2>'
        f'<table><thead><tr><th>Identity</th><th>Password</th><th>Source</th>'
        f'<th>Date</th><th>Risk</th></tr></thead><tbody>{cred_rows}</tbody></table></div>'
        f'</body></html>'
    )
    Path(path).write_text(page, encoding="utf-8")
    print(f"[+] HTML report saved: {path}")
# ── PDF report (fpdf2) ────────────────────────────────────────────────
def to_pdf(data: dict, path: str, investigator_id: str = "NOX-AUTO") -> None:
    """Render the investigation results in *data* as a PDF report at *path*.

    Sections, in order: cover page, executive summary (KPIs and severity
    breakdown), pivot chain, credential findings (first 500 records) and,
    when present, dork results and scrape results.

    Args:
        data: Report payload. Keys read directly here are ``target``,
            ``records``, ``dork_results`` and ``scrape_results``; the whole
            dict is also handed to ``build_exec_summary`` and
            ``render_pivot_chain``.
        path: Destination path passed to ``FPDF.output``.
        investigator_id: Label printed on the cover page.

    Raises:
        RuntimeError: If fpdf2 is not installed.
    """
    # D1: raise a clear error with install hint if fpdf2 is absent — never silently return.
    try:
        from fpdf import FPDF  # type: ignore
    except ImportError as exc:
        msg = "[!] fpdf2 not installed — PDF report cannot be generated. Run: pip install fpdf2"
        print(msg)
        raise RuntimeError(msg) from exc
    from datetime import timezone as _timezone

    summary = build_exec_summary(data)
    chain = render_pivot_chain(data)
    target = _raw(data.get("target", "Unknown"))
    records = data.get("records", []) or []
    # The stamp is labelled "UTC", so read the clock in UTC rather than local time.
    ts = datetime.now(_timezone.utc).strftime("%Y-%m-%d %H:%M:%S UTC")

    class _PDF(FPDF):
        def header(self):
            self.set_font("Helvetica", "B", 8)
            self.set_text_color(120, 120, 120)
            self.cell(0, 5, "NOX - FORENSIC INTELLIGENCE REPORT - CONFIDENTIAL", align="R")
            self.ln(3)

        def footer(self):
            self.set_y(-12)
            self.set_font("Helvetica", "", 8)
            self.set_text_color(150, 150, 150)
            self.cell(0, 5, _pdf_safe(f"Page {self.page_no()} | {target[:50]}"), align="C")

    pdf = _PDF(orientation="P", unit="mm", format="A4")
    pdf.set_auto_page_break(auto=True, margin=15)
    pdf.set_margins(15, 15, 15)

    # ── Local drawing helpers (closures over ``pdf``) ─────────────────
    def _section_title(title):
        # Bold 13pt heading followed by a full-width rule.
        pdf.set_font("Helvetica", "B", 13)
        pdf.set_text_color(0, 0, 0)
        pdf.cell(0, 9, title, ln=True)
        pdf.line(15, pdf.get_y(), 195, pdf.get_y())
        pdf.ln(3)

    def _header_row(columns):
        # White-on-dark column header; restores black text afterwards.
        pdf.set_font("Helvetica", "B", 8)
        pdf.set_fill_color(30, 30, 30)
        pdf.set_text_color(255, 255, 255)
        for col_name, col_w in columns:
            pdf.cell(col_w, 6, col_name, border=1, fill=True)
        pdf.ln()
        pdf.set_text_color(0, 0, 0)

    def _body_rows(columns, rows, fill_rgb):
        # One 5mm-high filled row per entry; widths come from *columns*.
        for row in rows:
            pdf.set_fill_color(*fill_rgb)
            pdf.set_font("Helvetica", "", 7)
            for val, (_, col_w) in zip(row, columns):
                pdf.cell(col_w, 5, val, border=1, fill=True)
            pdf.ln()

    def _subsection(title, columns, rows, fill_rgb):
        # Bold 10pt sub-heading followed by a complete table.
        pdf.set_font("Helvetica", "B", 10)
        pdf.cell(0, 7, _pdf_safe(title), ln=True)
        _header_row(columns)
        _body_rows(columns, rows, fill_rgb)

    def _pattern_summary(item):
        # "name(count), name(count)" for the item's pattern → matches mapping.
        return ", ".join(f"{k}({len(v)})" for k, v in (item.get("patterns") or {}).items())

    # ── Cover page ────────────────────────────────────────────────────
    pdf.add_page()
    pdf.set_fill_color(15, 15, 15)
    pdf.rect(0, 0, 210, 297, "F")
    pdf.set_y(65)
    pdf.set_font("Helvetica", "B", 26)
    pdf.set_text_color(0, 220, 60)
    pdf.cell(0, 12, "FORENSIC INTELLIGENCE REPORT", align="C")
    pdf.ln(8)
    pdf.set_font("Helvetica", "B", 13)
    pdf.set_text_color(200, 200, 200)
    pdf.cell(0, 8, _pdf_safe(f"Target: {target}"), align="C")
    pdf.ln(6)
    pdf.set_font("Helvetica", "", 10)
    pdf.set_text_color(140, 140, 140)
    for line in [f"Generated: {ts}", f"Investigator: {investigator_id}",
                 f"Framework: NOX v{_nox_ver()}", "Classification: RESTRICTED"]:
        pdf.cell(0, 6, _pdf_safe(line), align="C")
        pdf.ln(5)

    # ── Executive Summary ─────────────────────────────────────────────
    pdf.add_page()
    pdf.set_fill_color(255, 255, 255)
    pdf.set_text_color(0, 0, 0)
    pdf.set_font("Helvetica", "B", 15)
    pdf.cell(0, 10, "Executive Summary", ln=True)
    pdf.set_draw_color(0, 180, 50)
    pdf.set_line_width(0.4)
    pdf.line(15, pdf.get_y(), 195, pdf.get_y())
    pdf.ln(4)
    kpis = [
        ("Total Time", summary["elapsed"]),
        ("Nodes Discovered", str(summary["nodes_discovered"])),
        ("Cleartext Passwords Found", str(summary["cleartext_passwords"])),
        ("Pivot Depth", str(summary["pivot_depth"])),
        ("Total Records", str(summary["total_records"])),
        ("High-Value Targets", str(summary["hvt_count"])),
    ]
    pdf.set_font("Helvetica", "B", 10)
    for label, value in kpis:
        pdf.set_fill_color(245, 245, 245)
        pdf.cell(95, 7, _pdf_safe(label), border=1, fill=True)
        pdf.set_font("Helvetica", "", 10)
        pdf.cell(80, 7, _pdf_safe(value), border=1, ln=True)
        pdf.set_font("Helvetica", "B", 10)
    pdf.ln(4)

    # Severity breakdown
    pdf.set_font("Helvetica", "B", 11)
    pdf.cell(0, 7, "Severity Breakdown", ln=True)
    _sev_c = {"Critical": (220, 0, 30), "High": (220, 100, 0),
              "Medium": (200, 180, 0), "Low": (0, 150, 50), "Info": (100, 100, 100)}
    # max(..., 1) avoids a division by zero when every bucket is empty.
    total_b = max(sum(summary["buckets"].values()), 1)
    for level, count in summary["buckets"].items():
        pdf.set_font("Helvetica", "", 9)
        pdf.cell(35, 6, _pdf_safe(level), border=1)
        pdf.cell(20, 6, str(count), border=1)
        bar_w = int(count / total_b * 120)
        # Remember where the empty bar cell starts so the bar can be drawn inside it.
        x, y = pdf.get_x(), pdf.get_y()
        pdf.cell(125, 6, "", border=1)
        if bar_w:
            rc, gc, bc = _sev_c.get(level, (100, 100, 100))
            pdf.set_fill_color(rc, gc, bc)
            pdf.rect(x + 1, y + 1, bar_w, 4, "F")
        pdf.ln()

    # ── Pivot Chain ───────────────────────────────────────────────────
    pdf.ln(5)
    pdf.set_font("Helvetica", "B", 11)
    pdf.cell(0, 7, "Pivot Chain Visualization", ln=True)
    pdf.line(15, pdf.get_y(), 195, pdf.get_y())
    pdf.ln(3)
    pdf.set_font("Courier", "", 8)
    pdf.set_fill_color(240, 255, 240)
    for c_line in chain:
        # Hard-wrap long chains at 100 chars; max(..., 1) still emits one
        # (empty) row for an empty line.
        for start in range(0, max(len(c_line), 1), 100):
            pdf.set_x(15)
            pdf.cell(180, 5, _pdf_safe(c_line[start:start + 100]), border=0, ln=True, fill=True)
    pdf.ln(3)

    # ── Credential Findings ───────────────────────────────────────────
    pdf.add_page()
    _section_title("Credential Findings")
    cols = [("Identity", 60), ("Password", 45), ("Source", 35), ("Date", 25), ("Risk", 15)]
    _header_row(cols)
    for r in records[:500]:
        pw = _rget(r, "password")
        if not pw and not _rget(r, "email") and not _rget(r, "username"):
            continue  # skip noise rows with no actionable data
        try:
            rs = float(_rget(r, "risk_score") or 0)
        except (TypeError, ValueError):
            rs = 0.0  # unparseable score: render as 0 instead of aborting the report
        # Manual page-break with repeated column headers (§5.1). This must run
        # BEFORE the row styling below: the header helper changes font and fill
        # colour, which would otherwise leak into the first row of the new page.
        if pdf.get_y() > pdf.h - 25:
            pdf.add_page()
            _header_row(cols)
        if rs >= 90:
            pdf.set_fill_color(255, 220, 220)
        elif rs >= 70:
            pdf.set_fill_color(255, 240, 220)
        else:
            pdf.set_fill_color(255, 255, 255)
        pdf.set_font("Helvetica", "", 7)
        vals = [
            _pdf_safe(_rget(r, "email") or _rget(r, "username"), 38),
            _pdf_safe(pw, 28),
            _pdf_safe(_rget(r, "source"), 22),
            _pdf_safe(_rget(r, "breach_date"), 14),
            f"{rs:.0f}",
        ]
        for val, (_, w) in zip(vals, cols):
            pdf.cell(w, 5, val, border=1, fill=True)
        pdf.ln()

    # ── Dork Results ─────────────────────────────────────────────────
    dork_results = data.get("dork_results", []) or []
    if dork_results:
        pdf.add_page()
        _section_title(_pdf_safe(f"Dork Results ({len(dork_results)} hits)"))
        dork_cols = [("URL / Title", 95), ("Snippet", 55), ("Engine", 30)]
        _header_row(dork_cols)
        _body_rows(
            dork_cols,
            (
                [
                    # Fall back to the title when the URL is missing OR empty.
                    _pdf_safe(h.get("url") or h.get("title", ""), 65),
                    _pdf_safe(h.get("snippet", ""), 38),
                    _pdf_safe(h.get("engine", ""), 20),
                ]
                for h in dork_results[:200]
            ),
            (245, 245, 255),
        )

    # ── Scrape Results ────────────────────────────────────────────────
    scrape_results = data.get("scrape_results", {}) or {}
    pastes = scrape_results.get("pastes", [])
    creds_sc = scrape_results.get("credentials", [])
    tg_hits = scrape_results.get("telegram", [])
    mc_hits = scrape_results.get("dork_misconfigs", [])
    if pastes or creds_sc or tg_hits or mc_hits:
        pdf.add_page()
        _section_title("Scrape Results")
        if pastes:
            _subsection(
                f"Pastes ({len(pastes)})",
                [("Site", 25), ("Paste ID", 80), ("Patterns", 75)],
                (
                    [
                        _pdf_safe(p.get("site", ""), 15),
                        _pdf_safe(p.get("id", ""), 55),
                        _pdf_safe(_pattern_summary(p), 50),
                    ]
                    for p in pastes[:100]
                ),
                (245, 245, 245),
            )
            pdf.ln(3)
        if creds_sc:
            _subsection(
                f"Extracted Credentials ({len(creds_sc)})",
                [("Raw Credential", 120), ("Source", 30), ("Paste ID", 30)],
                (
                    [
                        _pdf_safe(c.get("raw", ""), 80),
                        _pdf_safe(c.get("source", ""), 20),
                        _pdf_safe(c.get("paste_id", ""), 20),
                    ]
                    for c in creds_sc[:150]
                ),
                (255, 240, 240),
            )
            pdf.ln(3)
        if tg_hits:
            _subsection(
                f"Telegram CTI ({len(tg_hits)})",
                [("Channel", 50), ("Message Excerpt", 100), ("Patterns", 30)],
                (
                    [
                        _pdf_safe(f"t.me/s/{t.get('channel','')}", 35),
                        _pdf_safe(t.get("text", ""), 70),
                        _pdf_safe(_pattern_summary(t), 25),
                    ]
                    for t in tg_hits[:80]
                ),
                (245, 245, 255),
            )
            pdf.ln(3)
        if mc_hits:
            _subsection(
                f"Misconfigurations ({len(mc_hits)})",
                [("URL", 90), ("Title", 60), ("Dork", 30)],
                (
                    [
                        _pdf_safe(m.get("url", ""), 60),
                        _pdf_safe(m.get("title", ""), 40),
                        _pdf_safe(m.get("dork", ""), 25),
                    ]
                    for m in mc_hits[:80]
                ),
                (255, 245, 230),
            )

    pdf.output(path)
    print(f"[+] PDF report saved: {path}")
+525
View File
@@ -0,0 +1,525 @@
"""
sources/helpers/scanner.py
Recursive Avalanche Engine for NOX autoscan.
Pipeline per asset (sequential phases):
Phase 1 — Breach scan
Phase 2 — Hash crack (non-blocking, on breach results)
Phase 3 — Dork
Phase 4 — Scrape
→ Harvest new identifiers from all phases
→ Reinject every new unique identifier (not seen before) recursively
"""
import asyncio
import logging
import re
from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple
if TYPE_CHECKING:
from nox import Orchestrator
# Logger for structured scan events; obtained under the name "nox.system".
_syslog = logging.getLogger("nox.system")
# Loose e-mail matcher: local part, "@", one host label, ".", then word chars/dots.
# NOTE(review): the trailing [\w.]+ also accepts a sentence-ending "." — confirm callers cope.
_EMAIL_RE = re.compile(r"[\w.+-]+@[\w-]+\.[\w.]+")
# Captures the 3–39 character handle that follows a github/twitter/linkedin/reddit
# profile URL prefix (case-insensitive).
_USERNAME_RE = re.compile(r"(?:github\.com|twitter\.com|linkedin\.com/in|reddit\.com/u)/([A-Za-z0-9_.-]{3,39})", re.I)
# Either a "+"-prefixed number with separators, or a bare NNN-NNN-NNNN style number.
_PHONE_RE = re.compile(r"\+\d[\d\s.\-()]{7,14}\d|\b\d{3}[\s.\-]\d{3}[\s.\-]\d{4}\b")
# Two to four consecutive capitalised words (each 2–21 letters), treated as a personal name.
_NAME_RE = re.compile(r"\b([A-Z][a-z]{1,20}(?:\s+[A-Z][a-z]{1,20}){1,3})\b")
# Maximum number of dork hits consumed per asset (applied as a slice bound).
_DORK_LIMIT = 20
# Identifier types that may be re-injected as new assets.
_PIVOT_TYPES = {"email", "username", "phone", "name", "ip", "domain"}
def _cfg_depth(orc=None) -> int:
    """Return the maximum pivot depth for this run.

    Lookup order: ``orc.config.pivot_depth`` (A7/A10) when it is set, then
    ``nox.Cfg.PIVOT_DEPTH``, then the hard-coded fallback ``2`` when ``nox``
    cannot be imported.
    """
    # getattr on None simply yields the default, so a missing orchestrator
    # or a missing config both fall through to the global setting.
    override = getattr(getattr(orc, "config", None), "pivot_depth", None)
    if override is not None:
        return int(override)
    try:
        from nox import Cfg  # type: ignore
        return Cfg.PIVOT_DEPTH
    except ImportError:
        return 2
def _cfg_concurrency(orc=None) -> int:
    """Return the concurrency limit for this run.

    Lookup order: ``orc.config.concurrency`` (A7) when it is set, then
    ``nox.Cfg.CONCURRENCY``, then the hard-coded fallback ``15`` when ``nox``
    cannot be imported.
    """
    # getattr on None simply yields the default, so a missing orchestrator
    # or a missing config both fall through to the global setting.
    override = getattr(getattr(orc, "config", None), "concurrency", None)
    if override is not None:
        return int(override)
    try:
        from nox import Cfg  # type: ignore
        return Cfg.CONCURRENCY
    except ImportError:
        return 15
def _out(level: str, msg: str) -> None:
    """Emit a console message through ``nox.out``, or to stderr as a fallback.

    Any failure while importing or calling ``nox.out`` results in a plain
    ``[level] msg`` line on standard error instead.
    """
    try:
        from nox import out as emit  # type: ignore
        emit(level, msg)
    except Exception:
        import sys
        sys.stderr.write(f"[{level}] {msg}\n")
def _extract_ids_from_text(text: str, exclude: str = "") -> List[Tuple[str, str]]:
    """Extract pivotable identifiers from free text, excluding the current asset.

    Args:
        text: Arbitrary text (titles, snippets, paste lines …). Empty or
            ``None`` yields an empty list.
        exclude: Identifier to leave out (compared case-insensitively).

    Returns:
        ``(value, qtype)`` tuples in match order, where qtype is one of
        ``"email"``, ``"username"``, ``"phone"`` or ``"name"``. Duplicates
        are not removed here.
    """
    found: List[Tuple[str, str]] = []
    if not text:
        return found
    excl = (exclude or "").lower()
    for m in _EMAIL_RE.findall(text):
        # The pattern's trailing character class accepts ".", so an address at
        # the end of a sentence is captured with the full stop attached; drop it
        # so "a@b.com." and "a@b.com" are recognised as the same identifier.
        v = m.lower().rstrip(".")
        if v and v != excl:
            found.append((v, "email"))
    for m in _USERNAME_RE.findall(text):
        # Same sentence-ending "." problem as for e-mail addresses.
        v = m.lower().rstrip(".")
        if v and v != excl:
            found.append((v, "username"))
    for m in _PHONE_RE.findall(text):
        # Strip separators; a leading "+" is kept and counts towards the length.
        clean = re.sub(r"[\s.\-()]", "", m)
        if 8 <= len(clean) <= 15 and clean != excl:
            found.append((clean, "phone"))
    for m in _NAME_RE.findall(text):
        if len(m.split()) >= 2 and m.lower() != excl:
            found.append((m, "name"))
    return found
def _ids_from_records(records: list, exclude: str = "") -> List[Tuple[str, str, str]]:
    """
    Extract pivotable identifiers from breach records.
    Returns (value, qtype, ref) where ref is the source/breach name for logging.

    Fields read from each record (via ``getattr``, missing ones are ignored):
    ``email``, ``username``, ``phone``, ``full_name``, ``ip_address``,
    ``domain``, plus ``metadata["emails"]``. Values are stripped before they
    are length-checked and compared (case-insensitively) against *exclude*,
    so a whitespace-padded copy of the current asset is not reported as new.
    """
    found: List[Tuple[str, str, str]] = []
    excl = (exclude or "").strip().lower()
    fields = (
        ("email", "email"),
        ("username", "username"),
        ("phone", "phone"),
        ("full_name", "name"),
        ("ip_address", "ip"),
        ("domain", "domain"),
    )
    for r in records or []:
        src = getattr(r, "source", "") or ""
        breach = getattr(r, "breach_name", "") or src
        for attr, qtype in fields:
            raw = getattr(r, attr, "")
            if not raw:
                continue
            # Non-string values (e.g. a numeric phone) are stringified rather
            # than crashing on len()/lower().
            val = str(raw).strip()
            if len(val) > 2 and val.lower() != excl:
                found.append((val, qtype, breach))
        meta = getattr(r, "metadata", None)
        extra_emails = meta.get("emails") if isinstance(meta, dict) else None
        for em in extra_emails or []:
            if not isinstance(em, str):
                continue
            em = em.strip().lower()
            if em and em != excl:
                found.append((em, "email", breach))
    return found
# ── Pivot log entry schema ─────────────────────────────────────────────────
# {
# "asset": str, # identifier scanned
# "qtype": str, # email/username/phone/name/domain/ip
# "depth": int, # 0=seed, 1=first pivot, …
# "parent": str|None, # asset that discovered this one
# "found_in": str, # phase that found this asset: seed/breach/dork/scrape/hash_crack
# "records": int, # breach records found for this asset
# "dorks": int, # dork hits found for this asset
# "scrape": int, # scrape items found for this asset
# "children": List[dict], # [{asset, qtype, found_in, ref}] — new assets discovered
# "cracked": List[str], # plaintexts cracked from hashes in breach results
# }
class AvalancheScanner:
    """Recursive pivot scanner driven by an orchestrator.

    Every asset runs through breach scan → hash crack → dork → scrape.
    Identifiers harvested from those phases are re-injected as new assets
    (each processed at most once) until the configured pivot depth is
    exceeded. Results accumulate on the instance and are returned by
    :meth:`run`.
    """

    def __init__(self, orchestrator: "Orchestrator") -> None:
        """Store the orchestrator and initialise empty per-run state."""
        self._orc = orchestrator
        # Normalised (lower-cased, stripped) keys of every asset already claimed.
        self.seen_assets: Set[str] = set()
        # A2: single semaphore for the entire run, created lazily inside the event loop
        self._sem: Optional[asyncio.Semaphore] = None
        self._all_records: List = []
        self._dork_hits: List[dict] = []
        self._seen_dork_urls: Set[str] = set()
        # A6: scrape_hits merged atomically per _do_process call
        self._scrape_hits: Dict = {"pastes": [], "credentials": [], "hashes": [],
                                   "telegram": [], "dork_misconfigs": []}
        self._max_depth: int = 0
        # key → future resolved once that asset's pipeline has finished.
        self._in_flight: Dict[str, asyncio.Future] = {}
        self.pivot_log: List[dict] = []
        # A8: global set to prevent duplicate entries in discovered_assets
        self._seen_discovered: Set[str] = set()
        self.discovered_assets: List[dict] = []

    def _get_sem(self) -> asyncio.Semaphore:
        """Return the run-wide semaphore, creating it on first use."""
        # A2: semaphore created once per run, shared across all coroutines
        if self._sem is None:
            self._sem = asyncio.Semaphore(_cfg_concurrency(self._orc))
        return self._sem

    async def run(self, target: str) -> tuple:
        """Scan *target* and return ``(records, dork_hits, scrape_hits)``.

        When the orchestrator config has ``no_pivot`` set (A9), only a single
        breach scan of the seed is performed; otherwise the full recursive
        pipeline runs.
        """
        cfg = getattr(self._orc, "config", None)
        no_pivot = getattr(cfg, "no_pivot", False) if cfg else False
        if no_pivot:
            try:
                from nox import Detect  # type: ignore
                qtype = Detect.qtype(target)
            except ImportError:
                qtype = "email"
            async with self._get_sem():
                try:
                    records = (await self._orc._full_async_scan(target, qtype)) or []
                except Exception as exc:
                    # Same best-effort policy as the pivoting path, but leave a
                    # trace instead of failing silently.
                    _syslog.warning("BREACH_FAIL asset=%s err=%s", target, exc)
                    records = []
            self._all_records.extend(records)
            self.seen_assets.add(target.lower().strip())
            self.pivot_log.append({
                "asset": target, "qtype": qtype, "depth": 0, "parent": None,
                "found_in": "seed", "records": len(records), "dorks": 0,
                "scrape": 0, "children": [], "cracked": [],
            })
            return self._all_records, self._dork_hits, self._scrape_hits
        await self._process(target, depth=0, parent=None, found_in="seed")
        return self._all_records, self._dork_hits, self._scrape_hits

    def get_discovered_assets(self) -> List[dict]:
        """Return flat list of all discovered assets with full provenance."""
        return self.discovered_assets

    def get_max_depth(self) -> int:
        """Return the deepest pivot level that was actually processed."""
        return self._max_depth

    # ── Dedup gate ────────────────────────────────────────────────────
    async def _process(self, asset: str, depth: int,
                       parent: Optional[str], found_in: str) -> None:
        """Dedup gate: ensures each asset is processed exactly once.

        A caller that arrives for an asset already claimed by another
        coroutine waits for that pipeline to finish and then returns without
        doing any work of its own.
        """
        # A10: use per-run depth from orchestrator config
        if depth > _cfg_depth(self._orc):
            _syslog.debug("avalanche depth cap reached for %s", asset)
            return
        key = asset.lower().strip()
        if not key:
            return
        # A1: seen_assets is the single gate. There is no await between this
        # membership test and the add() below, so two coroutines on the same
        # event loop cannot both claim the same key.
        if key in self.seen_assets:
            pending = self._in_flight.get(key)
            if pending is not None:
                try:
                    await pending
                except Exception:
                    pass
            return
        self.seen_assets.add(key)
        fut: asyncio.Future = asyncio.get_running_loop().create_future()
        self._in_flight[key] = fut
        try:
            await self._do_process(asset, depth, parent, found_in)
        finally:
            # Always release waiters, even if the pipeline raised.
            if not fut.done():
                fut.set_result(None)

    # ── Core pipeline ─────────────────────────────────────────────────
    async def _do_process(self, asset: str, depth: int,
                          parent: Optional[str], found_in: str) -> None:
        """
        Sequential pipeline:
        Phase 1 — Breach scan
        Phase 2 — Hash crack (hashes cracked concurrently, awaited before Phase 3)
        Phase 3 — Dork
        Phase 4 — Scrape
        → Harvest all new identifiers with phase+ref annotation
        → Reinject every unseen identifier
        """
        if depth > self._max_depth:
            self._max_depth = depth
        try:
            from nox import Detect  # type: ignore
            qtype = Detect.qtype(asset)
        except ImportError:
            qtype = "email"
        indent = " " * depth
        _out("pivot" if depth > 0 else "info",
             f"{indent}[depth={depth}] {'' if depth > 0 else ''} {asset} ({qtype})"
             + (f"{found_in} via {parent}" if parent else " [SEED]"))
        _syslog.info("AVALANCHE asset=%s depth=%d parent=%s found_in=%s",
                     asset, depth, parent or "", found_in)

        # ── Phase 1: Breach scan ──────────────────────────────────────
        async with self._get_sem():
            try:
                records: List = (await self._orc._full_async_scan(asset, qtype)) or []
            except Exception as exc:
                _syslog.warning("BREACH_FAIL asset=%s err=%s", asset, exc)
                records = []
        _out("ok" if records else "dim",
             f"{indent} [breach] {len(records)} records")
        _syslog.info("BREACH_DONE asset=%s records=%d", asset, len(records))
        self._all_records.extend(records)

        # ── Phase 2: Hash crack ───────────────────────────────────────
        cracked_plaintexts: List[str] = []
        try:
            from sources.helpers.cracker import detect_hash  # type: ignore
            import aiohttp as _aio  # type: ignore
            async with _aio.ClientSession(connector=_aio.TCPConnector(limit=5)) as _cs:
                # Only records that carry a recognisable hash and no plaintext yet.
                crack_tasks = [
                    _crack_and_inject(_cs, getattr(r, "password_hash", ""), r,
                                      self.seen_assets, self._all_records,
                                      self, depth, asset, cracked_plaintexts)
                    for r in records
                    if getattr(r, "password_hash", "") and not getattr(r, "password", "")
                    and detect_hash(getattr(r, "password_hash", ""))
                ]
                if crack_tasks:
                    await asyncio.gather(*crack_tasks, return_exceptions=True)
        except ImportError:
            pass  # cracker helper or aiohttp unavailable: skip this phase

        # ── Phase 3: Dork ─────────────────────────────────────────────
        _out("info", f"{indent} [dork] querying for {asset}")
        try:
            dork_res = await self._async_dork(asset, qtype)
        except Exception as exc:
            _syslog.warning("DORK_FAIL asset=%s err=%s", asset, exc)
            dork_res = []
        dork_count = 0
        for hit in (dork_res or [])[:_DORK_LIMIT]:
            url = hit.get("url", "") or hit.get("title", "")
            if url and url not in self._seen_dork_urls:
                self._seen_dork_urls.add(url)
                hit["pivot_asset"] = asset
                hit["pivot_depth"] = depth
                self._dork_hits.append(hit)
                dork_count += 1
        _out("ok" if dork_count else "dim",
             f"{indent} [dork] {dork_count} hits")
        _syslog.info("DORK_DONE asset=%s hits=%d", asset, dork_count)

        # ── Phase 4: Scrape ───────────────────────────────────────────
        _out("info", f"{indent} [scrape] querying for {asset}")
        try:
            scrape_res = await self._async_scrape(asset)
        except Exception as exc:
            _syslog.warning("SCRAPE_FAIL asset=%s err=%s", asset, exc)
            scrape_res = {}
        # A6: collect scrape results locally, then merge atomically
        scrape_count = 0
        local_scrape: Dict = {k: [] for k in self._scrape_hits}
        for k in self._scrape_hits:
            for item in (scrape_res or {}).get(k, []):
                if isinstance(item, dict):
                    item["pivot_asset"] = asset
                    item["pivot_depth"] = depth
                local_scrape[k].append(item)
                scrape_count += 1
        # Atomic merge into shared dict (single-threaded event loop — safe)
        for k, items in local_scrape.items():
            self._scrape_hits[k].extend(items)
        _out("ok" if scrape_count else "dim",
             f"{indent} [scrape] {scrape_count} items")
        _syslog.info("SCRAPE_DONE asset=%s items=%d", asset, scrape_count)

        # ── Harvest new identifiers with phase+ref annotation ─────────
        new_ids = self._harvest_ids(asset, records, dork_res, scrape_res)

        # ── Deduplicate and queue children ────────────────────────────
        children: List[dict] = []
        child_tasks = []
        queued: Set[str] = set()
        for val, vqtype, phase, ref in new_ids:
            child_key = val.lower().strip()
            if not child_key or child_key in self.seen_assets or child_key in queued:
                continue
            queued.add(child_key)
            children.append({"asset": val, "qtype": vqtype, "found_in": phase, "ref": ref})
            # A8: prevent duplicate entries in discovered_assets across parallel parents
            if child_key not in self._seen_discovered:
                self._seen_discovered.add(child_key)
                self.discovered_assets.append({
                    "asset": val,
                    "qtype": vqtype,
                    "phase": phase,
                    "ref": ref,
                    "parent": asset,
                    "depth": depth + 1,
                })
            _out("pivot",
                 f"{indent} ↳ new asset [{phase}]: {val} ({vqtype}) ref: {ref[:60]}")
            _syslog.info("PIVOT_QUEUE asset=%s qtype=%s phase=%s ref=%s parent=%s depth=%d",
                         val, vqtype, phase, ref[:80], asset, depth + 1)
            child_tasks.append(
                self._process(val, depth + 1, parent=asset, found_in=phase)
            )

        # A5: run child tasks FIRST, then append pivot_log so the log reflects actual outcomes
        if child_tasks:
            _out("info", f"{indent} → reinjecting {len(child_tasks)} new asset(s)…")
            await asyncio.gather(*child_tasks, return_exceptions=True)

        # ── Log this node (after children complete — A5) ──────────────
        self.pivot_log.append({
            "asset": asset,
            "qtype": qtype,
            "depth": depth,
            "parent": parent,
            "found_in": found_in,
            "records": len(records),
            "dorks": dork_count,
            "scrape": scrape_count,
            "children": children,
            "cracked": cracked_plaintexts or [],
        })

    def _harvest_ids(self, asset: str, records: list, dork_res,
                     scrape_res) -> List[Tuple[str, str, str, str]]:
        """Collect ``(value, qtype, phase, ref)`` for every identifier found for *asset*.

        Sources, in order: breach records, the first ``_DORK_LIMIT`` dork
        hits, then scraped credentials, Telegram messages and misconfig hits.
        The current asset itself is excluded; no deduplication happens here.
        """
        new_ids: List[Tuple[str, str, str, str]] = []
        # From breach records
        for val, vqtype, ref in _ids_from_records(records, exclude=asset):
            if vqtype in _PIVOT_TYPES:
                new_ids.append((val, vqtype, "breach", ref))
        # From dork hits
        for hit in (dork_res or [])[:_DORK_LIMIT]:
            url = hit.get("url", "")
            dork = hit.get("dork", "")
            ref = url or dork or ""
            text = f"{hit.get('title','')} {hit.get('snippet','')} {url} {dork}"
            for val, vqtype in _extract_ids_from_text(text, exclude=asset):
                if vqtype in _PIVOT_TYPES:
                    new_ids.append((val, vqtype, "dork", ref[:120]))
        # From scrape results
        scraped = scrape_res or {}
        for cred in scraped.get("credentials", []):
            raw = cred.get("raw", "")
            # Build the ref from the paste id only when there is one; an
            # f-string is always truthy, so chaining it with `or` would never
            # reach the source fallback.
            paste_id = cred.get("paste_id")
            ref = f"paste:{paste_id}" if paste_id else (cred.get("source") or "scrape")
            for val, vqtype in _extract_ids_from_text(raw, exclude=asset):
                if vqtype in _PIVOT_TYPES:
                    new_ids.append((val, vqtype, "scrape", ref))
        for tg in scraped.get("telegram", []):
            ref = f"t.me/{tg.get('channel','')}"
            for val, vqtype in _extract_ids_from_text(tg.get("text", ""), exclude=asset):
                if vqtype in _PIVOT_TYPES:
                    new_ids.append((val, vqtype, "scrape", ref))
        for mc in scraped.get("dork_misconfigs", []):
            # Fall through empty/None values so the ref is always a sliceable string.
            ref = mc.get("url") or mc.get("title") or "misconfig"
            for val, vqtype in _extract_ids_from_text(
                    f"{mc.get('title','')} {mc.get('snippet','')}", exclude=asset):
                if vqtype in _PIVOT_TYPES:
                    new_ids.append((val, vqtype, "scrape", ref[:120]))
        return new_ids

    # ── Dork dispatcher ───────────────────────────────────────────────
    async def _async_dork(self, asset: str, qtype: str = "email") -> list:
        """Run the dorking engine for *asset* and return a list of hit dicts.

        Uses the orchestrator's async dorking engine over a fresh aiohttp
        session; when an import fails, falls back to the synchronous
        ``orchestrator.dork`` in the default executor. Any other error is
        logged at debug level and yields an empty list.
        """
        try:
            import aiohttp as _aio  # type: ignore
            import ssl as _ssl
            connector = _aio.TCPConnector(limit=10, ssl=_ssl.create_default_context(), family=0)
            async with _aio.ClientSession(connector=connector) as session:
                recs = await self._orc.dorking_engine.async_search(session, asset, qtype)
                hits = []
                for r in recs:
                    # Records without raw_data produce an all-empty hit.
                    raw = getattr(r, "raw_data", None) or {}
                    hits.append({
                        "url": raw.get("url", ""),
                        "title": raw.get("url", raw.get("dork", "")),
                        "snippet": "",
                        "dork": raw.get("dork", ""),
                        "engine": "DDG",
                    })
                return hits
        except ImportError:
            loop = asyncio.get_running_loop()
            result = await loop.run_in_executor(None, self._orc.dork, asset)
            return result if isinstance(result, list) else []
        except Exception as exc:
            _syslog.debug("DORK_ERR asset=%s err=%s", asset, exc)
            return []

    # ── Scrape dispatcher ─────────────────────────────────────────────
    async def _async_scrape(self, asset: str) -> dict:
        """Run the scrape engine for *asset* in the default executor.

        Returns the engine's result dict, or a dict with the five expected
        keys mapped to empty lists when the result is not a dict or an
        error occurs.
        """
        # A3: instantiate a fresh Session + ScrapeEngine per call to avoid sharing
        # a non-thread-safe requests.Session / cloudscraper across concurrent coroutines.
        _empty: dict = {"pastes": [], "credentials": [], "hashes": [],
                        "telegram": [], "dork_misconfigs": []}
        try:
            loop = asyncio.get_running_loop()
            try:
                from nox import Session, NoxConfig, ScrapeEngine  # type: ignore
                _cfg = getattr(self._orc, "config", None) or NoxConfig()
                _session = Session(_cfg)
                _engine = ScrapeEngine(_session, self._orc.db)
                qtype = "email"
                try:
                    from nox import Detect  # type: ignore
                    qtype = Detect.qtype(asset)
                except Exception:
                    pass  # keep the "email" default when detection is unavailable
                result = await loop.run_in_executor(None, _engine.run, asset, qtype)
            except Exception:
                # Dedicated engine unavailable or failed: use the orchestrator's own scrape.
                result = await loop.run_in_executor(None, self._orc.scrape, asset)
            return result if isinstance(result, dict) else _empty
        except Exception as exc:
            _syslog.debug("SCRAPE_ERR asset=%s err=%s", asset, exc)
            return _empty
# ── Hash crack helper ──────────────────────────────────────────────────────
async def _crack_and_inject(session, hash_value: str, record_ref,
                            seen_assets: Set[str], all_records: list,
                            scanner: "AvalancheScanner",
                            depth: int, parent_asset: str,
                            cracked_out: List[str]) -> None:
    """Try to crack *hash_value*; on success update the record and pivot on the plaintext.

    Args:
        session: HTTP client session forwarded to ``async_crack``.
        hash_value: Hash string taken from the breach record.
        record_ref: Record object mutated in place on success (``password``,
            ``hash_type`` and ``data_types`` are written).
        seen_assets: Keys of assets already claimed by the scanner.
        all_records: Unused here; kept for signature compatibility.
        scanner: Scanner whose ``_process`` receives the cracked plaintext.
        depth: Depth of the asset whose record is being cracked.
        parent_asset: That asset's identifier (for logging / provenance).
        cracked_out: List that receives the plaintext on success.
    """
    from sources.helpers.cracker import detect_hash, async_crack, CRACK_TIMEOUT  # type: ignore
    hash_type = detect_hash(hash_value)
    if not hash_type:
        return
    try:
        plaintext = await asyncio.wait_for(
            async_crack(session, hash_value, hash_type), timeout=CRACK_TIMEOUT)
    except Exception as exc:  # includes asyncio.TimeoutError
        _syslog.debug("CRACK_FAIL hash=%s reason=%s", hash_value[:16], exc)
        return
    if not plaintext:
        _syslog.debug("CRACK_FAIL hash=%s reason=no_result", hash_value[:16])
        return
    record_ref.password = plaintext
    record_ref.hash_type = hash_type
    # data_types may be None; normalise once so both the membership test and
    # the rebuilt list are safe.
    existing_types = list(record_ref.data_types or [])
    if "Cracked" not in existing_types:
        record_ref.data_types = existing_types + ["Cracked"]
    _syslog.info("CRACK_OK hash=%s plain=%s parent=%s", hash_value[:16], plaintext, parent_asset)
    _out("ok", f" [crack] {hash_value[:16]}… → {plaintext} (from {parent_asset})")
    cracked_out.append(plaintext)
    # A4: the cracked plaintext is re-injected as a new asset one level deeper.
    # NOTE(review): no qtype is passed from here — the scanner detects the type
    # itself, so the plaintext is not explicitly tagged as a password. Confirm
    # that pivoting on cracked passwords is intended for all sources.
    key = plaintext.lower().strip()  # same normalisation as the scanner's dedup gate
    if key and key not in seen_assets and depth + 1 <= _cfg_depth(scanner._orc):
        await scanner._process(plaintext, depth + 1,
                               parent=parent_asset, found_in="hash_crack")