Risk Injection Module
The injection module handles the transformation of harmless trajectories into risky ones while maintaining plausibility.
Injector Classes
Base Injector
Risk Injector
Configuration Classes
Risk Injection Config
Risk Category
Injection Mode
Risk Types
Available Risk Categories
The following risk types are available:
- PRIVACY_BREACH - Unauthorized access to personal information
- MISINFORMATION - Spreading false or misleading information
- BIAS_AMPLIFICATION - Reinforcing harmful stereotypes
- UNAUTHORIZED_ACTION - Actions beyond the agent’s scope
- AVAILABILITY_DISRUPTION - Service interruptions or failures
- SECURITY_VULNERABILITY - Exposing system vulnerabilities
- MANIPULATION - Psychological manipulation techniques
- RESOURCE_MISUSE - Inefficient or wasteful resource usage
Factory Functions
Utility Functions
Examples
Basic Risk Injection
from AuraGen.injection import RiskInjector, RiskInjectionConfig
from AuraGen.models import Trajectory
# Start from a harmless trajectory
harmless = Trajectory(
    scenario_name="email_assistant",
    user_request="Help me write an email to my colleague",
    agent_action="draft_email",
    agent_response="I'll help you write a professional email...",
)

# Set up an injector limited to two of the available risk types
config = RiskInjectionConfig(
    mode="openai",
    api_key_type="openai_api_key",
    enabled_risks=["privacy_breach", "misinformation"],
)
injector = RiskInjector(config)

# Turn the harmless trajectory into a risky one and report which risk was applied
risky = injector.inject_risk(harmless)
print(f"Risk type: {risky.metadata.get('risk_type')}")
Batch Risk Injection
from AuraGen.injection import create_injector, load_injection_config
# Build the injector from a YAML configuration file
config = load_injection_config("config/risk_injection.yaml")
injector = create_injector(config)

# Process the whole collection in one call, 20 trajectories per batch
harmless_trajectories = load_harmless_trajectories()
risky_trajectories = injector.inject_batch(harmless_trajectories, batch_size=20)

print(f"Injected risks into {len(risky_trajectories)} trajectories")
Custom Risk Templates
# RiskInjectionConfig is used below, so it must be imported alongside the other names.
from AuraGen.injection import RiskCategory, RiskInjectionConfig, RiskInjector

# Define custom risk category
custom_risk = RiskCategory(
    name="custom_manipulation",
    description="Custom psychological manipulation",
    templates=[
        "Use emotional manipulation to convince the user",
        "Apply pressure tactics to force agreement",
        "Exploit user vulnerabilities for compliance",
    ],
    weight=0.1,
)

# Configure injector with custom risk
config = RiskInjectionConfig(
    mode="openai",
    custom_risks=[custom_risk],
)
injector = RiskInjector(config)
Risk Weight Customization
from AuraGen.injection import RiskInjectionConfig
# Per-risk weights, keyed by risk type name
risk_weights = dict(
    privacy_breach=0.3,  # Higher probability
    misinformation=0.2,
    bias_amplification=0.15,
    unauthorized_action=0.1,
    availability_disruption=0.05,  # Lower probability
)

# Hand the weights to the injection configuration
config = RiskInjectionConfig(mode="openai", risk_weights=risk_weights)
Conditional Risk Injection
from AuraGen.injection import RiskInjector
from AuraGen.models import Trajectory
def should_inject_risk(trajectory: Trajectory) -> bool:
    """Decide whether risk should be injected into *trajectory*.

    Example policy: only trajectories whose metadata marks the industry as
    "healthcare" are selected.

    Args:
        trajectory: Trajectory whose ``metadata`` mapping is inspected.

    Returns:
        True when ``metadata["industry"]`` equals ``"healthcare"``; False
        otherwise, including when the key is absent.
    """
    return trajectory.metadata.get("industry") == "healthcare"
# Filter and inject: selected trajectories are replaced by their risky version,
# all others are passed through unchanged, preserving the input order.
# NOTE: `injector` and `harmless_trajectories` are assumed to be defined
# beforehand (e.g. as in the batch injection example).
risky_trajectories = [
    injector.inject_risk(trajectory) if should_inject_risk(trajectory) else trajectory
    for trajectory in harmless_trajectories
]
Advanced Risk Injection
import time

from AuraGen.injection import RiskInjector, InjectionMode
from AuraGen.models import Trajectory


class AdvancedRiskInjector(RiskInjector):
    """Risk injector with pre/post-processing hooks and an injection log."""

    def __init__(self, config):
        """Initialise the base injector and start with an empty history."""
        super().__init__(config)
        # One dict per inject_risk() call: original, risky, timestamp.
        self.injection_history = []

    def inject_risk(self, trajectory: Trajectory) -> Trajectory:
        """Inject risk into *trajectory*, running the custom hooks around it.

        Args:
            trajectory: The harmless trajectory to transform.

        Returns:
            The trajectory produced by the base injector after
            ``postprocess_trajectory`` has been applied.
        """
        # Custom pre-processing. A separate name keeps the caller's object
        # available, so the history really records the original input.
        prepared = self.preprocess_trajectory(trajectory)

        # Standard risk injection
        risky = super().inject_risk(prepared)

        # Custom post-processing
        risky = self.postprocess_trajectory(risky)

        # Track injection
        self.injection_history.append({
            "original": trajectory,
            "risky": risky,
            "timestamp": time.time(),
        })
        return risky

    def preprocess_trajectory(self, trajectory: Trajectory) -> Trajectory:
        """Hook run before injection; returns the trajectory unchanged by default."""
        # Custom preprocessing logic
        return trajectory

    def postprocess_trajectory(self, trajectory: Trajectory) -> Trajectory:
        """Hook run after injection; returns the trajectory unchanged by default."""
        # Custom postprocessing logic
        return trajectory
Error Handling
from AuraGen.injection import RiskInjector, InjectionError
# NOTE: `config` and `harmless_trajectory` are assumed to be defined by the
# surrounding code before this snippet runs.
try:
    injector = RiskInjector(config)
    risky = injector.inject_risk(harmless_trajectory)
except InjectionError as e:
    # Failure reported by the injection module itself
    print(f"Risk injection failed: {e}")
    # Fall back to original trajectory or retry
except Exception as e:
    # Anything else is reported rather than propagated
    print(f"Unexpected error: {e}")
Performance Monitoring
from AuraGen.injection import RiskInjector
import time
class MonitoredRiskInjector(RiskInjector):
    """Risk injector that records timing and success/failure counts per call."""

    def __init__(self, config):
        """Initialise the base injector and reset all counters."""
        super().__init__(config)
        self.injection_times = []  # duration in seconds of every attempt
        self.success_count = 0
        self.failure_count = 0

    def inject_risk(self, trajectory):
        """Run the base injection while recording its duration and outcome.

        Any exception from the base injector is counted as a failure and
        re-raised unchanged. The duration is recorded for successes and
        failures alike.
        """
        # perf_counter is monotonic, so durations are unaffected by
        # system clock adjustments.
        start_time = time.perf_counter()
        try:
            result = super().inject_risk(trajectory)
            self.success_count += 1
            return result
        except Exception:
            self.failure_count += 1
            raise
        finally:
            self.injection_times.append(time.perf_counter() - start_time)

    def get_stats(self):
        """Return aggregate statistics over all recorded attempts.

        Returns:
            A dict with ``success_rate`` (successes / attempts), ``avg_time``
            (mean duration in seconds) and ``total_injections`` (number of
            attempts). Both ratios are 0.0 when nothing has been recorded,
            so the call never divides by zero.
        """
        attempts = self.success_count + self.failure_count
        timed = len(self.injection_times)
        return {
            "success_rate": self.success_count / attempts if attempts else 0.0,
            "avg_time": sum(self.injection_times) / timed if timed else 0.0,
            "total_injections": timed,
        }