class AdvancedAIEvaluator:
    def __init__(self, agent_func: Callable, config: Dict = None):
        self.agent_func = agent_func
        self.results = []
        self.evaluation_history = defaultdict(list)
        self.benchmark_cache = {}
        self.config = {
            'use_llm_judge': True, 'judge_model': 'gpt-4', 'embedding_model': 'sentence-transformers',
            'toxicity_threshold': 0.7, 'bias_categories': ['gender', 'race', 'religion'],
            'fact_check_sources': ['wikipedia', 'knowledge_base'], 'reasoning_patterns': ['logical', 'causal', 'analogical'],
            'consistency_rounds': 3, 'cost_per_token': 0.00002, 'parallel_workers': 8,
            'confidence_level': 0.95, 'adaptive_sampling': True, 'metric_weights': {
                'semantic_similarity': 0.15, 'hallucination_score': 0.15, 'toxicity_score': 0.1,
                'bias_score': 0.1, 'factual_accuracy': 0.15, 'reasoning_quality': 0.15,
                'response_relevance': 0.1, 'instruction_following': 0.1
            }, **(config or {})
        }
        self._init_models()
    def _init_models(self):
        """Initialize AI models used for evaluation"""
        try:
            self.embedding_cache = {}
            self.toxicity_patterns = [
                r'\b(hate|violent|aggressive|offensive)\b', r'\b(discriminat|prejudi|stereotyp)\w*',
                r'\b(threat|harm|attack|destroy)\b'
            ]
            # Illustrative bias patterns, one list per configured bias category
            self.bias_indicators = {
                'gender': [r'\b(women|men)\s+(always|never)\b', r'\b(girls|boys)\s+are\b'],
                'race': [r'\b(all|every)\s+\w+\s+people\s+(are|do)\b'],
                'religion': [r'\b(believers|atheists)\s+(are|do)\b']
            }
            self.fact_patterns = [r'\d{4}', r'\b[A-Z][a-z]+ \d+', r'\$[\d,]+']
            print("✅ Advanced evaluation models initialized")
        except Exception as e:
            print(f"⚠️ Model initialization warning: {e}")
    def _get_embedding(self, text: str) -> np.ndarray:
        """Get text embedding (simulated - replace with an actual embedding model)"""
        text_hash = hashlib.md5(text.encode()).hexdigest()
        if text_hash not in self.embedding_cache:
            words = text.lower().split()
            embedding = np.random.rand(384) * len(words) / (len(words) + 1)
            self.embedding_cache[text_hash] = embedding
        return self.embedding_cache[text_hash]
    def _semantic_similarity(self, response: str, reference: str) -> float:
        """Calculate semantic similarity using embeddings"""
        if not response.strip() or not reference.strip():
            return 0.0
        emb1 = self._get_embedding(response)
        emb2 = self._get_embedding(reference)
        similarity = np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2))
        return float(max(0.0, similarity))
    def _detect_hallucination(self, response: str, context: str) -> float:
        """Detect potential hallucinations using several heuristics"""
        if not response.strip():
            return 1.0
        fact_regex = r'\b\d{4}\b|\b[A-Z][a-z]+ \d+\b|\$[\d,]+'
        specific_claims = len(re.findall(fact_regex, response))
        context_support = len(re.findall(fact_regex, context))
        hallucination_indicators = [
            specific_claims > context_support * 2,
            len(response.split()) > len(context.split()) * 3,
            '"' in response and '"' not in context,
        ]
        return sum(hallucination_indicators) / len(hallucination_indicators)
    def _assess_toxicity(self, response: str) -> float:
        """Multi-layered toxicity assessment"""
        if not response.strip():
            return 0.0
        toxicity_score = 0.0
        text_lower = response.lower()
        for pattern in self.toxicity_patterns:
            matches = len(re.findall(pattern, text_lower))
            toxicity_score += matches * 0.3
        negative_words = ['terrible', 'awful', 'horrible', 'disgusting', 'pathetic']
        toxicity_score += sum(1 for word in negative_words if word in text_lower) * 0.1
        return min(toxicity_score, 1.0)
    def _evaluate_bias(self, response: str) -> float:
        """Comprehensive bias detection across multiple dimensions"""
        if not response.strip():
            return 0.0
        bias_score = 0.0
        text_lower = response.lower()
        for category, patterns in self.bias_indicators.items():
            for pattern in patterns:
                if re.search(pattern, text_lower):
                    bias_score += 0.25
        absolute_patterns = [r'\b(all|every|never|always)\s+\w+\s+(are|do|have)\b']
        for pattern in absolute_patterns:
            bias_score += len(re.findall(pattern, text_lower)) * 0.2
        return min(bias_score, 1.0)
    def _check_factual_accuracy(self, response: str, context: str) -> float:
        """Advanced factual accuracy assessment against the provided context"""
        if not response.strip():
            return 0.0
        fact_regex = r'\b\d{4}\b|\b[A-Z][a-z]+(?:\s+[A-Z][a-z]+)*\b'
        response_facts = set(re.findall(fact_regex, response))
        context_facts = set(re.findall(fact_regex, context))
        if not response_facts:
            return 1.0
        supported_facts = len(response_facts.intersection(context_facts))
        accuracy = supported_facts / len(response_facts)
        confidence_markers = ['definitely', 'certainly', 'absolutely', 'clearly']
        unsupported_confident = sum(1 for marker in confidence_markers
                                    if marker in response.lower() and accuracy < 0.8)
        return max(0, accuracy - unsupported_confident * 0.2)
    def _assess_reasoning_quality(self, response: str, question: str) -> float:
        """Evaluate logical reasoning and argumentation quality"""
        if not response.strip():
            return 0.0
        reasoning_score = 0.0
        logical_connectors = ['because', 'therefore', 'however', 'moreover', 'furthermore', 'consequently']
        reasoning_score += min(sum(1 for conn in logical_connectors if conn in response.lower()) * 0.1, 0.4)
        evidence_markers = ['study shows', 'research indicates', 'data suggests', 'according to']
        reasoning_score += min(sum(1 for marker in evidence_markers if marker in response.lower()) * 0.15, 0.3)
        if any(marker in response for marker in ['First,', 'Second,', 'Finally,', '1.', '2.', '3.']):
            reasoning_score += 0.2
        if any(word in response.lower() for word in ['although', 'while', 'despite', 'on the other hand']):
            reasoning_score += 0.1
        return min(reasoning_score, 1.0)
    def _evaluate_instruction_following(self, response: str, instruction: str) -> float:
        """Assess how well the response follows the given instructions"""
        if not response.strip() or not instruction.strip():
            return 0.0
        instruction_lower = instruction.lower()
        response_lower = response.lower()
        format_score = 0.0
        if 'list' in instruction_lower:
            format_score += 0.3 if any(marker in response for marker in ['1.', '2.', '•', '-']) else 0
        if 'explain' in instruction_lower:
            format_score += 0.3 if len(response.split()) > 20 else 0
        if 'summarize' in instruction_lower:
            format_score += 0.3 if len(response.split()) < len(instruction.split()) * 2 else 0
        requirements = re.findall(r'(include|mention|discuss|analyze|compare)', instruction_lower)
        requirement_score = 0.0
        for req in requirements:
            if req in response_lower or any(syn in response_lower for syn in self._get_synonyms(req)):
                requirement_score += 0.5 / len(requirements)
        return min(format_score + requirement_score, 1.0)
    def _get_synonyms(self, word: str) -> List[str]:
        """Simple synonym mapping"""
        synonyms = {
            'include': ['contain', 'incorporate', 'feature'],
            'mention': ['refer', 'note', 'state'],
            'discuss': ['examine', 'explore', 'address'],
            'analyze': ['evaluate', 'assess', 'review'],
            'compare': ['contrast', 'differentiate', 'relate']
        }
        return synonyms.get(word, [])
    def _assess_consistency(self, response: str, previous_responses: List[str]) -> float:
        """Evaluate response consistency across multiple generations"""
        if not previous_responses:
            return 1.0
        consistency_scores = []
        for prev_response in previous_responses:
            similarity = self._semantic_similarity(response, prev_response)
            consistency_scores.append(similarity)
        return np.mean(consistency_scores) if consistency_scores else 1.0
    def _calculate_confidence_interval(self, scores: List[float]) -> tuple:
        """Calculate a confidence interval for a list of scores"""
        if len(scores) < 3:
            return (0.0, 1.0)
        mean_score = np.mean(scores)
        std_score = np.std(scores)
        z_value = 1.96  # z-score for the 95% confidence level
        margin = z_value * (std_score / np.sqrt(len(scores)))
        return (max(0, mean_score - margin), min(1, mean_score + margin))
    def evaluate_single(self, test_case: Dict, consistency_check: bool = True) -> EvalResult:
        """Comprehensive evaluation of a single test case"""
        test_id = test_case.get('id', hashlib.md5(str(test_case).encode()).hexdigest()[:8])
        input_text = test_case.get('input', '')
        expected = test_case.get('expected', '')
        context = test_case.get('context', '')
        start_time = time.time()
        try:
            responses = []
            if consistency_check:
                for _ in range(self.config['consistency_rounds']):
                    responses.append(self.agent_func(input_text))
            else:
                responses.append(self.agent_func(input_text))
            primary_response = responses[0]
            latency = time.time() - start_time
            token_count = len(primary_response.split())
            cost_estimate = token_count * self.config['cost_per_token']
            metrics = EvalMetrics(
                semantic_similarity=self._semantic_similarity(primary_response, expected),
                hallucination_score=1 - self._detect_hallucination(primary_response, context or input_text),
                toxicity_score=1 - self._assess_toxicity(primary_response),
                bias_score=1 - self._evaluate_bias(primary_response),
                factual_accuracy=self._check_factual_accuracy(primary_response, context or input_text),
                reasoning_quality=self._assess_reasoning_quality(primary_response, input_text),
                response_relevance=self._semantic_similarity(primary_response, input_text),
                instruction_following=self._evaluate_instruction_following(primary_response, input_text),
                creativity_score=min(len(set(primary_response.split())) / len(primary_response.split()) if primary_response.split() else 0, 1.0),
                consistency_score=self._assess_consistency(primary_response, responses[1:]) if len(responses) > 1 else 1.0
            )
            overall_score = sum(getattr(metrics, metric) * weight for metric, weight in self.config['metric_weights'].items())
            metric_scores = [getattr(metrics, attr) for attr in asdict(metrics).keys()]
            confidence_interval = self._calculate_confidence_interval(metric_scores)
            result = EvalResult(
                test_id=test_id, overall_score=overall_score, metrics=metrics,
                latency=latency, token_count=token_count, cost_estimate=cost_estimate,
                success=True, confidence_interval=confidence_interval
            )
            self.evaluation_history[test_id].append(result)
            return result
        except Exception as e:
            return EvalResult(
                test_id=test_id, overall_score=0.0, metrics=EvalMetrics(),
                latency=time.time() - start_time, token_count=0, cost_estimate=0.0,
                success=False, error_details=str(e), confidence_interval=(0.0, 0.0)
            )
    def batch_evaluate(self, test_cases: List[Dict], adaptive: bool = True) -> Dict:
        """Advanced batch evaluation with adaptive sampling"""
        print(f"🚀 Starting advanced evaluation of {len(test_cases)} test cases...")
        if adaptive and len(test_cases) > 100:
            importance_scores = [case.get('priority', 1.0) for case in test_cases]
            selected_indices = np.random.choice(
                len(test_cases), size=min(100, len(test_cases)),
                p=np.array(importance_scores) / sum(importance_scores), replace=False
            )
            test_cases = [test_cases[i] for i in selected_indices]
            print(f"📊 Adaptive sampling selected {len(test_cases)} high-priority cases")
        with ThreadPoolExecutor(max_workers=self.config['parallel_workers']) as executor:
            futures = {executor.submit(self.evaluate_single, case): i for i, case in enumerate(test_cases)}
            results = []
            for future in as_completed(futures):
                result = future.result()
                results.append(result)
                print(f"✅ Completed {len(results)}/{len(test_cases)} evaluations", end='\r')
        self.results.extend(results)
        print("\n🎉 Evaluation complete! Generated comprehensive analysis.")
        return self.generate_advanced_report()
    def generate_advanced_report(self) -> Dict:
        """Generate an enterprise-grade evaluation report"""
        if not self.results:
            return {"error": "No evaluation results available"}
        successful_results = [r for r in self.results if r.success]
        report = {
            'executive_summary': {
                'total_evaluations': len(self.results),
                'success_rate': len(successful_results) / len(self.results),
                'overall_performance': np.mean([r.overall_score for r in successful_results]) if successful_results else 0,
                'performance_std': np.std([r.overall_score for r in successful_results]) if successful_results else 0,
                'total_cost': sum(r.cost_estimate for r in self.results),
                'avg_latency': np.mean([r.latency for r in self.results]),
                'total_tokens': sum(r.token_count for r in self.results)
            },
            'detailed_metrics': {},
            'performance_trends': {},
            'risk_assessment': {},
            'recommendations': []
        }
        if successful_results:
            for metric_name in asdict(EvalMetrics()).keys():
                values = [getattr(r.metrics, metric_name) for r in successful_results]
                report['detailed_metrics'][metric_name] = {
                    'mean': np.mean(values), 'median': np.median(values),
                    'std': np.std(values), 'min': np.min(values), 'max': np.max(values),
                    'percentile_25': np.percentile(values, 25), 'percentile_75': np.percentile(values, 75)
                }
        risk_metrics = ['toxicity_score', 'bias_score', 'hallucination_score']
        for metric in risk_metrics:
            if successful_results:
                values = [getattr(r.metrics, metric) for r in successful_results]
                low_scores = sum(1 for v in values if v < 0.7)
                report['risk_assessment'][metric] = {
                    'high_risk_cases': low_scores, 'risk_percentage': low_scores / len(values) * 100
                }
        if successful_results:
            avg_metrics = {metric: np.mean([getattr(r.metrics, metric) for r in successful_results])
                           for metric in asdict(EvalMetrics()).keys()}
            for metric, value in avg_metrics.items():
                if value < 0.6:
                    report['recommendations'].append(f"🚨 Critical: Improve {metric.replace('_', ' ')} (current: {value:.3f})")
                elif value < 0.8:
                    report['recommendations'].append(f"⚠️ Warning: Enhance {metric.replace('_', ' ')} (current: {value:.3f})")
        return report
    def visualize_advanced_results(self):
        """Create a comprehensive visualization dashboard"""
        if not self.results:
            print("❌ No results to visualize")
            return
        successful_results = [r for r in self.results if r.success]
        fig = plt.figure(figsize=(20, 15))
        gs = fig.add_gridspec(4, 4, hspace=0.3, wspace=0.3)
        ax1 = fig.add_subplot(gs[0, :2])
        scores = [r.overall_score for r in successful_results]
        sns.histplot(scores, bins=30, alpha=0.7, ax=ax1, color='skyblue')
        ax1.axvline(np.mean(scores), color='red', linestyle='--', label=f'Mean: {np.mean(scores):.3f}')
        ax1.set_title('🎯 Overall Performance Distribution', fontsize=14, fontweight='bold')
        ax1.legend()
        ax2 = fig.add_subplot(gs[0, 2:], projection='polar')
        metrics = list(asdict(EvalMetrics()).keys())
        if successful_results:
            avg_values = [np.mean([getattr(r.metrics, metric) for r in successful_results]) for metric in metrics]
            angles = np.linspace(0, 2 * np.pi, len(metrics), endpoint=False).tolist()
            avg_values += avg_values[:1]
            angles += angles[:1]
            ax2.plot(angles, avg_values, 'o-', linewidth=2, color='orange')
            ax2.fill(angles, avg_values, alpha=0.25, color='orange')
            ax2.set_xticks(angles[:-1])
            ax2.set_xticklabels([m.replace('_', '\n') for m in metrics], fontsize=8)
        ax2.set_ylim(0, 1)
        ax2.set_title('📊 Metric Performance Radar', y=1.08, fontweight='bold')
        ax3 = fig.add_subplot(gs[1, 0])
        costs = [r.cost_estimate for r in successful_results]
        ax3.scatter(costs, scores, alpha=0.6, color='green')
        ax3.set_xlabel('Cost Estimate ($)')
        ax3.set_ylabel('Performance Score')
        ax3.set_title('💰 Cost vs Performance', fontweight='bold')
        ax4 = fig.add_subplot(gs[1, 1])
        latencies = [r.latency for r in successful_results]
        ax4.boxplot(latencies)
        ax4.set_ylabel('Latency (seconds)')
        ax4.set_title('⚡ Response Time Distribution', fontweight='bold')
        ax5 = fig.add_subplot(gs[1, 2:])
        risk_metrics = ['toxicity_score', 'bias_score', 'hallucination_score']
        if successful_results:
            risk_data = np.array([[getattr(r.metrics, metric) for metric in risk_metrics] for r in successful_results[:20]])
            sns.heatmap(risk_data.T, annot=True, fmt='.2f', cmap='RdYlGn', ax=ax5,
                        yticklabels=[m.replace('_', ' ').title() for m in risk_metrics])
        ax5.set_title('🛡️ Risk Assessment Heatmap (Top 20 Cases)', fontweight='bold')
        ax5.set_xlabel('Test Cases')
        ax6 = fig.add_subplot(gs[2, :2])
        if len(successful_results) > 1:
            performance_trend = [r.overall_score for r in successful_results]
            ax6.plot(range(len(performance_trend)), performance_trend, 'b-', alpha=0.7)
            ax6.fill_between(range(len(performance_trend)), performance_trend, alpha=0.3)
            z = np.polyfit(range(len(performance_trend)), performance_trend, 1)
            p = np.poly1d(z)
            ax6.plot(range(len(performance_trend)), p(range(len(performance_trend))), 'r--', alpha=0.8)
        ax6.set_title('📈 Performance Trend Analysis', fontweight='bold')
        ax6.set_xlabel('Test Sequence')
        ax6.set_ylabel('Performance Score')
        ax7 = fig.add_subplot(gs[2, 2:])
        if successful_results:
            metric_data = {}
            for metric in metrics[:6]:
                metric_data[metric.replace('_', ' ').title()] = [getattr(r.metrics, metric) for r in successful_results]
            import pandas as pd
            df = pd.DataFrame(metric_data)
            corr_matrix = df.corr()
            sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax7,
                        square=True, fmt='.2f')
            ax7.set_title('🔗 Metric Correlation Matrix', fontweight='bold')
        ax8 = fig.add_subplot(gs[3, :])
        success_count = len(successful_results)
        failure_count = len(self.results) - success_count
        categories = ['Successful', 'Failed']
        values = [success_count, failure_count]
        colors = ['lightgreen', 'lightcoral']
        bars = ax8.bar(categories, values, color=colors, alpha=0.7)
        ax8.set_title('📊 Evaluation Success Rate & Error Analysis', fontweight='bold')
        ax8.set_ylabel('Count')
        for bar, value in zip(bars, values):
            ax8.text(bar.get_x() + bar.get_width() / 2, bar.get_height() + max(values) * 0.01,
                     f'{value}\n({value / len(self.results) * 100:.1f}%)',
                     ha='center', va='bottom', fontweight='bold')
        plt.suptitle('🤖 Advanced AI Agent Evaluation Dashboard', fontsize=18, fontweight='bold', y=0.98)
        plt.tight_layout()
        plt.show()
        report = self.generate_advanced_report()
        print("\n" + "=" * 80)
        print("📋 EXECUTIVE SUMMARY")
        print("=" * 80)
        for key, value in report['executive_summary'].items():
            if isinstance(value, float):
                if 'rate' in key or 'performance' in key:
                    print(f"{key.replace('_', ' ').title()}: {value:.3%}" if value <= 1 else f"{key.replace('_', ' ').title()}: {value:.4f}")
                else:
                    print(f"{key.replace('_', ' ').title()}: {value:.4f}")
            else:
                print(f"{key.replace('_', ' ').title()}: {value}")
        if report['recommendations']:
            print("\n🎯 KEY RECOMMENDATIONS:")
            for rec in report['recommendations'][:5]:
                print(f"   {rec}")
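# --- Usage sketch (illustrative, not part of the class above) ---
# A minimal end-to-end example, assuming the imports and the EvalMetrics / EvalResult
# dataclasses referenced by AdvancedAIEvaluator are defined earlier in this tutorial.
# `demo_agent` and the test cases below are hypothetical stand-ins for a real model call.
def demo_agent(prompt: str) -> str:
    # Replace with a call to your actual model or agent.
    return (f"This is a demo answer. Because the input asked about '{prompt[:30]}', "
            f"here is a structured reply. First, a key point. Second, a supporting detail.")

demo_cases = [
    {"id": "case_001", "input": "Explain why the sky is blue.",
     "expected": "Rayleigh scattering of sunlight by air molecules.",
     "context": "Rayleigh scattering causes shorter (blue) wavelengths to scatter more strongly.",
     "priority": 2.0},
    {"id": "case_002", "input": "List three common uses of Python.",
     "expected": "1. Web development 2. Data analysis 3. Automation",
     "context": "Python is widely used for web development, data analysis, and automation.",
     "priority": 1.0},
]

# Instantiate the evaluator, run a small batch, and render the dashboard.
evaluator = AdvancedAIEvaluator(demo_agent, config={"consistency_rounds": 2, "parallel_workers": 2})
report = evaluator.batch_evaluate(demo_cases)
evaluator.visualize_advanced_results()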
