Mirror of https://github.com/mblanke/ThreatHunt.git, synced 2026-03-01 14:00:20 -05:00
- Implemented PlaybookManager for creating and managing investigation playbooks with templates.
- Added SavedSearches component for managing bookmarked queries and recurring scans.
- Introduced TimelineView for visualizing forensic event timelines with zoomable charts.
- Enhanced backend processing with auto-queued jobs for dataset uploads and improved database concurrency.
- Updated frontend components for better user experience and performance optimizations.
- Documented changes in update log for future reference.
165 lines
5.7 KiB
Python
from pathlib import Path

p = Path(r'd:/Projects/Dev/ThreatHunt/backend/app/services/host_inventory.py')
t = p.read_text(encoding='utf-8')

# insert budget vars near existing counters
old = '''    connections: dict[tuple, int] = defaultdict(int)
    total_rows = 0
    ds_with_hosts = 0
'''
new = '''    connections: dict[tuple, int] = defaultdict(int)
    total_rows = 0
    ds_with_hosts = 0
    sampled_dataset_count = 0
    total_row_budget = max(0, int(settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS))
    max_connections = max(0, int(settings.NETWORK_INVENTORY_MAX_CONNECTIONS))
    global_budget_reached = False
    dropped_connections = 0
'''
if old not in t:
    raise SystemExit('counter block not found')
t = t.replace(old, new)
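# Note: each step in this script anchors on an exact snippet of the current
# source and exits via SystemExit before any write if the anchor is missing,
# so host_inventory.py is only rewritten after every replacement has matched.
# The max(0, int(...)) calls clamp negative settings to 0, and because every
# budget check below is written as "if budget and ...", a value of 0 disables
# that budget entirely.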
# update batch size and sampled count increments + global budget checks
old2 = '''        batch_size = 10000
        max_rows_per_dataset = max(0, int(settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET))
        rows_scanned_this_dataset = 0
        sampled_dataset = False
        last_row_index = -1
        while True:
'''
new2 = '''        batch_size = 5000
        max_rows_per_dataset = max(0, int(settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET))
        rows_scanned_this_dataset = 0
        sampled_dataset = False
        last_row_index = -1
        while True:
            if total_row_budget and total_rows >= total_row_budget:
                global_budget_reached = True
                break
'''
if old2 not in t:
    raise SystemExit('batch block not found')
t = t.replace(old2, new2)
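# The new check sits at the top of the while-True batch loop, so a hunt that
# has already exhausted the global row budget stops before fetching another
# batch; the batch_size drop (10000 -> 5000) simply makes each fetch pull
# fewer rows.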
old3 = '''                if max_rows_per_dataset and rows_scanned_this_dataset >= max_rows_per_dataset:
                    sampled_dataset = True
                    break

                data = ro.data or {}
                total_rows += 1
                rows_scanned_this_dataset += 1
'''
new3 = '''                if max_rows_per_dataset and rows_scanned_this_dataset >= max_rows_per_dataset:
                    sampled_dataset = True
                    break
                if total_row_budget and total_rows >= total_row_budget:
                    sampled_dataset = True
                    global_budget_reached = True
                    break

                data = ro.data or {}
                total_rows += 1
                rows_scanned_this_dataset += 1
'''
if old3 not in t:
    raise SystemExit('row scan block not found')
t = t.replace(old3, new3)
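# This per-row check complements the per-batch check above and also marks the
# dataset as sampled, so the existing "row budget reached" logging below still
# fires when the global budget cuts a dataset short mid-batch.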
# cap connection map growth
old4 = '''                for c in cols['remote_ip']:
                    rip = _clean(data.get(c))
                    if _is_valid_ip(rip):
                        rport = ''
                        for pc in cols['remote_port']:
                            rport = _clean(data.get(pc))
                            if rport:
                                break
                        connections[(host_key, rip, rport)] += 1
'''
new4 = '''                for c in cols['remote_ip']:
                    rip = _clean(data.get(c))
                    if _is_valid_ip(rip):
                        rport = ''
                        for pc in cols['remote_port']:
                            rport = _clean(data.get(pc))
                            if rport:
                                break
                        conn_key = (host_key, rip, rport)
                        if max_connections and len(connections) >= max_connections and conn_key not in connections:
                            dropped_connections += 1
                            continue
                        connections[conn_key] += 1
'''
if old4 not in t:
    raise SystemExit('connection block not found')
t = t.replace(old4, new4)
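# Minimal standalone sketch of the capping behavior patched in above (purely
# illustrative; the _demo_* names are not part of host_inventory.py): once the
# map holds the cap's worth of distinct keys, unseen keys are dropped and
# counted, while keys already present keep incrementing.
from collections import defaultdict

_demo_conns: dict[tuple, int] = defaultdict(int)
_demo_cap, _demo_dropped = 2, 0
for _key in [('h1', '1.2.3.4', '443'), ('h1', '5.6.7.8', '80'),
             ('h1', '9.9.9.9', '53'), ('h1', '1.2.3.4', '443')]:
    # Same guard shape as new4: drop only keys that would grow the map past the cap.
    if _demo_cap and len(_demo_conns) >= _demo_cap and _key not in _demo_conns:
        _demo_dropped += 1
        continue
    _demo_conns[_key] += 1
assert _demo_dropped == 1 and _demo_conns[('h1', '1.2.3.4', '443')] == 2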
# sampled_dataset counter
old5 = '''            if sampled_dataset:
                logger.info(
                    "Host inventory row budget reached for dataset %s (%d rows)",
                    ds.id,
                    rows_scanned_this_dataset,
                )
                break
'''
new5 = '''            if sampled_dataset:
                sampled_dataset_count += 1
                logger.info(
                    "Host inventory row budget reached for dataset %s (%d rows)",
                    ds.id,
                    rows_scanned_this_dataset,
                )
                break
'''
if old5 not in t:
    raise SystemExit('sampled block not found')
t = t.replace(old5, new5)
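# The sampled_dataset_count incremented here feeds the "sampled_datasets"
# field added to the stats block in the final step below.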
# break dataset loop if global budget reached
old6 = '''            if len(rows) < batch_size:
                break

    # Post-process hosts
'''
new6 = '''            if len(rows) < batch_size:
                break

        if global_budget_reached:
            logger.info(
                "Host inventory global row budget reached for hunt %s at %d rows",
                hunt_id,
                total_rows,
            )
            break

    # Post-process hosts
'''
if old6 not in t:
    raise SystemExit('post-process boundary block not found')
t = t.replace(old6, new6)
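# This second break exits the outer per-dataset loop once the global budget is
# reached; the break injected into new2 only exits a single dataset's batch loop.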
# add stats
old7 = '''            "row_budget_per_dataset": settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET,
            "sampled_mode": settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET > 0,
        },
    }
'''
new7 = '''            "row_budget_per_dataset": settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET,
            "row_budget_total": settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS,
            "connection_budget": settings.NETWORK_INVENTORY_MAX_CONNECTIONS,
            "sampled_mode": settings.NETWORK_INVENTORY_MAX_ROWS_PER_DATASET > 0 or settings.NETWORK_INVENTORY_MAX_TOTAL_ROWS > 0,
            "sampled_datasets": sampled_dataset_count,
            "global_budget_reached": global_budget_reached,
            "dropped_connections": dropped_connections,
        },
    }
'''
if old7 not in t:
    raise SystemExit('stats block not found')
t = t.replace(old7, new7)
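# Note that "sampled_mode" widens here: it is now true when either the
# per-dataset or the total row budget is configured, not just the former.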
p.write_text(t, encoding='utf-8')
print('updated host inventory with global row and connection budgets')
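# Re-running this script on an already-patched file is expected to fail fast
# (for example, the batch anchor no longer matches once batch_size is 5000)
# and exit before write_text, leaving the patched file untouched.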