BOOMER Evaluation Tutorial
This notebook demonstrates how to evaluate BOOMER's reasoning performance using ground truth data.
Overview
Evaluation in BOOMER helps you:
- Measure reasoning accuracy against known facts
- Compare different configurations
- Find optimal probability thresholds
- Identify systematic errors in predictions
We'll cover:
1. Basic evaluation metrics
2. Evaluating reasoning results
3. Type-specific evaluation
4. Grid search with evaluation
5. Analyzing errors
6. Creating custom evaluation datasets
# Import required modules
from boomer.evaluator import evaluate_facts
from boomer.model import KB, SubClassOf, EquivalentTo, DisjointWith
from boomer.reasoners.nx_reasoner import NxReasoner
from boomer.search import solve, grid_search
from boomer.model import SearchConfig, GridSearch
from boomer.datasets import animals, family
import pandas as pd
import matplotlib.pyplot as plt
1. Understanding Evaluation Metrics
BOOMER computes standard classification metrics for fact prediction:
# Create simple ground truth and predictions
ground_truth = [
SubClassOf(sub="Cat", sup="Mammal"),
SubClassOf(sub="Dog", sup="Mammal"),
SubClassOf(sub="Mammal", sup="Animal"),
EquivalentTo(sub="Cat", equivalent="Feline")
]
predictions = [
SubClassOf(sub="Cat", sup="Mammal"), # True Positive
SubClassOf(sub="Dog", sup="Mammal"), # True Positive
SubClassOf(sub="Bird", sup="Animal"), # False Positive
# Missing: Mammal->Animal (False Negative)
# Missing: Cat≡Feline (False Negative)
]
# Evaluate
stats = evaluate_facts(ground_truth, predictions)
print(f"True Positives: {stats.tp}")
print(f"False Positives: {stats.fp}")
print(f"False Negatives: {stats.fn}")
print(f"Precision: {stats.precision:.3f}")
print(f"Recall: {stats.recall:.3f}")
print(f"F1 Score: {stats.f1:.3f}")
2. Evaluating Reasoning Results
Let's evaluate a complete reasoning workflow using a real dataset:
# Load the animals dataset
kb = animals.kb
print(f"Dataset: {kb.name}")
print(f"Facts: {len(kb.facts)}")
print(f"Probabilistic facts: {len(kb.pfacts)}")
# Split into training and test sets
train_facts = kb.facts[:30] # Use first 30 facts for training
test_facts = kb.facts[30:] # Rest for testing
# Also keep some probabilistic facts for hypothesis generation
train_pfacts = kb.pfacts[:20]
print(f"\nTraining facts: {len(train_facts)}")
print(f"Test facts: {len(test_facts)}")
# Create training KB and run reasoning
train_kb = KB(
name="Animals Training",
facts=train_facts,
pfacts=train_pfacts
)
# Configure and run search with timeout
config = SearchConfig(
max_iterations=10000,
max_candidate_solutions=100,
timeout_seconds=60 # Add timeout to prevent infinite execution
)
solution = solve(train_kb, config)
print(f"Solution confidence: {solution.confidence:.3f}")
print(f"Solved probabilistic facts: {len(solution.solved_pfacts)}")
# Extract predictions with different probability thresholds
thresholds = [0.5, 0.7, 0.9]
results = []
for threshold in thresholds:
    # Filter predictions by posterior probability
    predictions = [
        spf.pfact.fact
        for spf in solution.solved_pfacts
        if spf.truth_value and spf.posterior_prob >= threshold
    ]
    # Evaluate against test set
    stats = evaluate_facts(test_facts, predictions)
    results.append({
        'threshold': threshold,
        'predictions': len(predictions),
        'precision': stats.precision,
        'recall': stats.recall,
        'f1': stats.f1,
        'tp': stats.tp,
        'fp': stats.fp,
        'fn': stats.fn
    })
# Display results
df = pd.DataFrame(results)
print(df.to_string(index=False))
# Visualize precision-recall trade-off
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
# Precision vs Recall
ax1.plot(df['recall'], df['precision'], 'o-', markersize=8)
for idx, row in df.iterrows():
    ax1.annotate(f"t={row['threshold']}",
                 (row['recall'], row['precision']),
                 textcoords="offset points", xytext=(5, 5))
ax1.set_xlabel('Recall')
ax1.set_ylabel('Precision')
ax1.set_title('Precision-Recall Trade-off')
ax1.grid(True, alpha=0.3)
# F1 Score vs Threshold
ax2.plot(df['threshold'], df['f1'], 's-', markersize=8, color='green')
ax2.set_xlabel('Probability Threshold')
ax2.set_ylabel('F1 Score')
ax2.set_title('F1 Score by Threshold')
ax2.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()
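If a single operating point is needed, one simple choice is the threshold that maximizes F1 in the sweep above, for example:
# Pick the threshold with the best F1 from the sweep above
best_row = df.loc[df['f1'].idxmax()]
print(f"Best threshold: {best_row['threshold']} (F1={best_row['f1']:.3f})")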
3. Type-Specific Evaluation
Evaluate performance on specific types of facts to understand reasoning capabilities:
# Create a KB with mixed fact types
mixed_kb = KB(
name="Mixed Facts",
facts=[
SubClassOf(sub="A", sup="B"),
SubClassOf(sub="B", sup="C"),
EquivalentTo(sub="X", equivalent="Y")
],
pfacts=[]
)
# Generate some predictions
predicted_facts = [
SubClassOf(sub="A", sup="B"), # Correct
SubClassOf(sub="A", sup="C"), # Correct (inferred)
SubClassOf(sub="B", sup="D"), # Wrong
EquivalentTo(sub="X", equivalent="Y"), # Correct
EquivalentTo(sub="Y", equivalent="Z") # Wrong
]
# Evaluate all facts
all_stats = evaluate_facts(mixed_kb.facts, predicted_facts)
print("All fact types:")
print(f" F1 Score: {all_stats.f1:.3f}")
print(f" Precision: {all_stats.precision:.3f}")
print(f" Recall: {all_stats.recall:.3f}")
# Evaluate only SubClassOf facts
subclass_stats = evaluate_facts(mixed_kb.facts, predicted_facts,
                                types=["SubClassOf"])
print("\nSubClassOf facts only:")
print(f" F1 Score: {subclass_stats.f1:.3f}")
print(f" Precision: {subclass_stats.precision:.3f}")
print(f" Recall: {subclass_stats.recall:.3f}")
# Evaluate only EquivalentTo facts
equiv_stats = evaluate_facts(mixed_kb.facts, predicted_facts,
                             types=["EquivalentTo"])
print("\nEquivalentTo facts only:")
print(f" F1 Score: {equiv_stats.f1:.3f}")
print(f" Precision: {equiv_stats.precision:.3f}")
print(f" Recall: {equiv_stats.recall:.3f}")
4. Grid Search with Automatic Evaluation
Find the best configuration by testing multiple parameter combinations:
# Prepare train/test split
family_kb = family.kb
train_kb = KB(
name="Family Training",
facts=family_kb.facts[:15],
pfacts=family_kb.pfacts[:10]
)
test_kb = KB(
name="Family Test",
facts=family_kb.facts[15:]
)
print(f"Training: {len(train_kb.facts)} facts, {len(train_kb.pfacts)} pfacts")
print(f"Testing: {len(test_kb.facts)} facts")
# Define grid search configuration
# Note: GridSearch requires a list of configurations
base_config = SearchConfig(
max_iterations=5000,
max_candidate_solutions=50,
timeout_seconds=30 # Add timeout for each configuration
)
# Create configurations with different parameters
grid_config = GridSearch(
configurations=[base_config],
configuration_matrix={
"max_iterations": [1000, 5000, 10000],
"max_candidate_solutions": [50, 100, 200],
"max_pfacts_per_clique": [50, 100, 1000]
}
)
# Run grid search with evaluation
print("Running grid search (this may take a moment)...")
grid_result = grid_search(train_kb, grid_config, test_kb)
print(f"\nCompleted {len(grid_result.results)} configurations")
# Analyze grid search results
results_data = []
for r in grid_result.results:
    if r.evaluation:
        results_data.append({
            'max_iterations': r.config.max_iterations,
            'max_candidate_solutions': r.config.max_candidate_solutions,
            'max_pfacts_per_clique': r.config.max_pfacts_per_clique,
            'f1': r.evaluation.f1,
            'precision': r.evaluation.precision,
            'recall': r.evaluation.recall
        })
results_df = pd.DataFrame(results_data)
if len(results_df) > 0:
    # Find best configuration by F1 score
    # (idxmax returns an index label, so use .loc rather than .iloc)
    best_idx = results_df['f1'].idxmax()
    best_config = results_df.loc[best_idx]
    print("Best configuration:")
    print(f" Max iterations: {best_config['max_iterations']}")
    print(f" Max candidate solutions: {best_config['max_candidate_solutions']}")
    print(f" Max pfacts per clique: {best_config['max_pfacts_per_clique']}")
    print(f" F1 Score: {best_config['f1']:.3f}")
    print(f" Precision: {best_config['precision']:.3f}")
    print(f" Recall: {best_config['recall']:.3f}")
else:
    print("No results with evaluation found")
# Visualize parameter impact on F1 score
if len(results_df) > 0:
    fig, axes = plt.subplots(1, 3, figsize=(15, 4))
    # Group by max_iterations
    iter_group = results_df.groupby('max_iterations')['f1'].mean().reset_index()
    axes[0].bar(iter_group['max_iterations'].astype(str), iter_group['f1'])
    axes[0].set_xlabel('Max Iterations')
    axes[0].set_ylabel('Mean F1 Score')
    axes[0].set_title('Impact of Max Iterations on F1')
    # Group by max_candidate_solutions
    cand_group = results_df.groupby('max_candidate_solutions')['f1'].mean().reset_index()
    axes[1].bar(cand_group['max_candidate_solutions'].astype(str), cand_group['f1'])
    axes[1].set_xlabel('Max Candidate Solutions')
    axes[1].set_ylabel('Mean F1 Score')
    axes[1].set_title('Impact of Max Candidates on F1')
    # Group by max_pfacts_per_clique
    pfact_group = results_df.groupby('max_pfacts_per_clique')['f1'].mean().reset_index()
    axes[2].bar(pfact_group['max_pfacts_per_clique'].astype(str), pfact_group['f1'])
    axes[2].set_xlabel('Max PFacts per Clique')
    axes[2].set_ylabel('Mean F1 Score')
    axes[2].set_title('Impact of Clique Size on F1')
    plt.tight_layout()
    plt.show()
else:
    print("No results to visualize")
5. Error Analysis
Examine false positives and false negatives to understand reasoning errors:
# Run evaluation with detailed fact lists
threshold = 0.7
predictions = [
spf.pfact.fact
for spf in solution.solved_pfacts
if spf.truth_value and spf.posterior_prob >= threshold
]
stats = evaluate_facts(test_facts, predictions)
print(f"Evaluation at threshold {threshold}:")
print(f" F1 Score: {stats.f1:.3f}")
print(f" True Positives: {stats.tp}")
print(f" False Positives: {stats.fp}")
print(f" False Negatives: {stats.fn}")
# Analyze error patterns
if stats.fp_list:
    fp_types = {}
    for fact in stats.fp_list:
        fact_type = type(fact).__name__
        fp_types[fact_type] = fp_types.get(fact_type, 0) + 1
    print("False positives by type:")
    for fact_type, count in sorted(fp_types.items()):
        print(f" {fact_type}: {count}")
if stats.fn_list:
    fn_types = {}
    for fact in stats.fn_list:
        fact_type = type(fact).__name__
        fn_types[fact_type] = fn_types.get(fact_type, 0) + 1
    print("\nFalse negatives by type:")
    for fact_type, count in sorted(fn_types.items()):
        print(f" {fact_type}: {count}")
6. Creating Custom Evaluation Datasets
Build evaluation datasets with known ground truth:
# Create a custom evaluation dataset
def create_hierarchy_test():
    """Create a test dataset with a clear hierarchy."""
    # Define ground truth hierarchy
    ground_truth = [
        # Direct assertions
        SubClassOf(sub="Siamese", sup="Cat"),
        SubClassOf(sub="Persian", sup="Cat"),
        SubClassOf(sub="Cat", sup="Feline"),
        SubClassOf(sub="Feline", sup="Mammal"),
        SubClassOf(sub="Mammal", sup="Animal"),
        # Transitive relations that should be inferred
        SubClassOf(sub="Siamese", sup="Feline"),
        SubClassOf(sub="Siamese", sup="Mammal"),
        SubClassOf(sub="Siamese", sup="Animal"),
        SubClassOf(sub="Cat", sup="Animal"),
        # Equivalences
        EquivalentTo(sub="Feline", equivalent="FelidaeFamily")
    ]
    # Training set (partial information)
    training_facts = [
        SubClassOf(sub="Siamese", sup="Cat"),
        SubClassOf(sub="Persian", sup="Cat"),
        SubClassOf(sub="Cat", sup="Feline"),
        SubClassOf(sub="Feline", sup="Mammal"),
        SubClassOf(sub="Mammal", sup="Animal"),
        EquivalentTo(sub="Feline", equivalent="FelidaeFamily")
    ]
    return ground_truth, training_facts
# Create the test dataset
ground_truth, training_facts = create_hierarchy_test()
print(f"Ground truth facts: {len(ground_truth)}")
print(f"Training facts: {len(training_facts)}")
print(f"Facts to be inferred: {len(ground_truth) - len(training_facts)}")
# Test reasoning on custom dataset
custom_kb = KB(
name="Custom Hierarchy",
facts=training_facts,
pfacts=[] # No probabilistic facts for this test
)
# Use reasoner to infer facts
reasoner = NxReasoner()
reasoning_solution = reasoner.reason(custom_kb)
# Extract all inferred facts
inferred_facts = [h.fact for h in reasoning_solution.entailed_hypotheses
                  if h.truth_value]
print(f"Inferred {len(inferred_facts)} facts")
# Evaluate
eval_stats = evaluate_facts(ground_truth, inferred_facts)
print(f"\nEvaluation results:")
print(f" Precision: {eval_stats.precision:.3f}")
print(f" Recall: {eval_stats.recall:.3f}")
print(f" F1 Score: {eval_stats.f1:.3f}")
if eval_stats.recall < 1.0 and eval_stats.fn_list:
    print(f"\nMissed facts:")
    for fact in eval_stats.fn_list:
        print(f" - {fact}")
Summary
This tutorial covered:
- Basic Metrics: Understanding precision, recall, and F1 scores
- Threshold Tuning: Finding optimal probability thresholds
- Type-Specific Evaluation: Measuring performance on different fact types
- Grid Search: Finding best configurations automatically
- Error Analysis: Understanding false positives and negatives
- Custom Datasets: Creating targeted evaluation scenarios
Key Takeaways
- Precision vs Recall: Higher thresholds increase precision but decrease recall
- F1 Score: Balances precision and recall for overall performance
- Grid Search: Automates finding optimal parameters
- Error Analysis: Helps identify systematic reasoning issues
- Type-Specific: Different fact types may require different strategies
Next Steps
- Try evaluation with your own datasets
- Experiment with different reasoning configurations
- Use grid search to optimize for specific metrics
- Analyze errors to improve hypothesis generation