# Message processing pipeline: hybrid regex + LLM secret detection for
# Hypercore log streams.
import asyncio

from core.detector import SecretDetector
from core.hypercore_reader import HypercoreReader
from core.llm_analyzer import LLMAnalyzer
from core.quarantine import QuarantineManager
from core.sanitized_writer import SanitizedWriter
class MessageProcessor:
    """Coordinates the hybrid secret-detection pipeline.

    Each log entry streamed from the Hypercore feed goes through:
      1. A fast regex scan (``SecretDetector``).
      2. For low-confidence regex hits, LLM verification (``LLMAnalyzer``).
    Confirmed secrets are redacted, quarantined via ``QuarantineManager``,
    and an alert is printed; everything else is written unchanged to the
    sanitized log.
    """

    def __init__(self, reader: HypercoreReader, detector: SecretDetector, llm_analyzer: LLMAnalyzer, quarantine: QuarantineManager, writer: SanitizedWriter, llm_threshold: float):
        self.reader = reader
        self.detector = detector
        self.llm_analyzer = llm_analyzer
        self.quarantine = quarantine
        self.writer = writer
        # Regex confidence at or above this value quarantines immediately,
        # skipping the (expensive) LLM verification step. e.g., 0.90
        self.llm_threshold = llm_threshold

    def _confirm_secret(self, content, regex_matches):
        """Return the confirmed-secret dict for *content*, or None.

        High-confidence regex matches are checked first across *all*
        matches so a single confident hit never incurs an LLM call.
        Otherwise the LLM is consulted exactly once for the whole entry
        (previously it was re-run on the identical content for every
        low-confidence match — pure duplicated cost, since the argument
        never changed inside the loop).
        """
        # Stage 1b: any high-confidence regex match quarantines immediately.
        for match in regex_matches:
            if match['confidence'] >= self.llm_threshold:
                return match

        # Stage 2: a single LLM pass verifies the low-confidence candidates.
        # NOTE(review): analyze() is invoked synchronously inside an async
        # pipeline, exactly as before; if it is network-bound it should
        # probably be an awaited coroutine — confirm with its implementation.
        llm_result = self.llm_analyzer.analyze(content)
        if llm_result.get("secret_found"):
            # Prefer the LLM's classification but keep the regex-extracted
            # value, since that is the literal text we must redact.
            first = regex_matches[0]
            return {
                "secret_type": llm_result.get("secret_type", first['secret_type']),
                "value": first['value'],
                "severity": llm_result.get("severity", first['severity']),
            }
        return None

    async def process_stream(self):
        """Main processing loop for the hybrid detection model."""
        async for entry in self.reader.stream_entries():
            # Stage 1: Fast Regex Scan
            regex_matches = self.detector.scan(entry.content)

            if not regex_matches:
                # No secrets found, write original entry to sanitized log
                self.writer.write(entry.content)
                continue

            # A potential secret was found. Default to sanitized, but may
            # be quarantined.
            sanitized_content = entry.content
            confirmed_secret = self._confirm_secret(entry.content, regex_matches)

            if confirmed_secret:
                # A secret is confirmed. Redact, quarantine, and alert.
                # NOTE(review): only the single confirmed value is redacted;
                # any other regex-matched values in the same entry reach the
                # sanitized log verbatim — confirm whether that is intended.
                sanitized_content = self.detector.redact(entry.content, confirmed_secret['value'])

                self.quarantine.quarantine_message(
                    message=entry,
                    secret_type=confirmed_secret['secret_type'],
                    severity=confirmed_secret['severity'],
                    redacted_content=sanitized_content
                )
                # Potentially trigger alerts here as well
                print(f"[ALERT] Confirmed secret {confirmed_secret['secret_type']} found and quarantined.")

            # Write the (potentially redacted) content to the sanitized log
            self.writer.write(sanitized_content)