Your weekly dose of actionable cloud wisdom to start the week right
The Problem
Your Azure AI bill has exploded from £200 to £2,000 per month after implementing a chatbot, your Computer Vision API calls are costing more than your entire infrastructure, and you’re getting charged for OpenAI tokens you didn’t even know you were using. Meanwhile, your AI features are popular with users, but finance is questioning whether the business value justifies the rapidly escalating costs.
The Solution
Implement intelligent cost management for Azure Cognitive Services using usage monitoring, request optimization, caching strategies, and tiered service selection. AI costs can be controlled without sacrificing functionality by understanding pricing models, optimizing requests, and choosing the right service tiers for each use case.
Essential Cognitive Services Cost Optimization:
1. Service Tier Optimization and Right-Sizing
{
"cognitive_services_pricing_analysis": {
"text_analytics": {
"free_tier": {
"monthly_limit": "5,000 transactions",
"cost_per_month": "£0",
"best_for": "Development and small-scale testing"
},
"standard_s0": {
"included_transactions": "25,000/month",
"cost_per_1k_additional": "£1.85",
"monthly_base_cost": "£185.00",
"best_for": "Low to medium volume production"
},
"standard_s": {
"cost_per_1k_transactions": "£1.85",
"no_monthly_commitment": true,
"best_for": "Variable or high-volume workloads"
}
},
"computer_vision": {
"free_tier": {
"monthly_limit": "20 transactions/minute, 5,000/month",
"cost_per_month": "£0"
},
"standard_s1": {
"cost_per_1k_transactions": "£0.74",
"included_transactions": "0",
"best_for": "Production workloads with predictable volume"
}
},
"openai_gpt35_turbo": {
"cost_per_1k_input_tokens": "£0.0012",
"cost_per_1k_output_tokens": "£0.0016",
"context_window": "4,096 tokens",
"best_for": "Most conversational AI use cases"
},
"openai_gpt4": {
"cost_per_1k_input_tokens": "£0.024",
"cost_per_1k_output_tokens": "£0.048",
"context_window": "8,192 tokens",
"cost_multiplier": "20x more than GPT-3.5",
"best_for": "Complex reasoning tasks only"
}
}
}
2. Automated Cost Monitoring and Alerting
# Azure Cognitive Services cost monitoring script
import os
import json
from datetime import datetime, timedelta
from azure.identity import DefaultAzureCredential
from azure.mgmt.consumption import ConsumptionManagementClient
from azure.mgmt.monitor import MonitorManagementClient
from azure.mgmt.cognitiveservices import CognitiveServicesManagementClient
class CognitiveServicesCostMonitor:
    """Analyse Azure Cognitive Services spend and produce optimisation advice.

    Wraps the Azure consumption, monitor and Cognitive Services management
    SDKs, authenticating with DefaultAzureCredential.
    """

    def __init__(self, subscription_id):
        """Create management clients scoped to *subscription_id*."""
        self.subscription_id = subscription_id
        self.credential = DefaultAzureCredential()
        # Initialize clients
        self.consumption_client = ConsumptionManagementClient(
            self.credential, subscription_id
        )
        self.monitor_client = MonitorManagementClient(
            self.credential, subscription_id
        )
        self.cognitive_client = CognitiveServicesManagementClient(
            self.credential, subscription_id
        )

    def analyze_cognitive_services_costs(self, days_back=30):
        """
        Analyze Cognitive Services costs and usage patterns.

        Aggregates consumption records for the last *days_back* days into
        per-service totals and daily cost series, then derives
        recommendations. Returns a summary dict, or None on SDK errors.
        """
        end_date = datetime.utcnow()
        start_date = end_date - timedelta(days=days_back)
        try:
            # Consumption API filter limits results to Cognitive Services meters.
            usage_details = self.consumption_client.usage_details.list(
                scope=f"/subscriptions/{self.subscription_id}",
                filter=f"properties/usageStart ge '{start_date.isoformat()}' and properties/usageStart le '{end_date.isoformat()}' and properties/meterCategory eq 'Cognitive Services'"
            )
            cost_analysis = {}
            total_cost = 0
            for usage in usage_details:
                service_name = usage.product
                cost = float(usage.pretax_cost)
                quantity = float(usage.quantity)
                if service_name not in cost_analysis:
                    cost_analysis[service_name] = {
                        'total_cost': 0,
                        'total_quantity': 0,
                        'daily_costs': {}
                    }
                cost_analysis[service_name]['total_cost'] += cost
                cost_analysis[service_name]['total_quantity'] += quantity
                total_cost += cost
                # Track daily costs (ISO date keys, so they sort chronologically)
                usage_date = usage.usage_start.strftime('%Y-%m-%d')
                daily = cost_analysis[service_name]['daily_costs']
                daily[usage_date] = daily.get(usage_date, 0) + cost
            # Calculate trends and recommendations
            recommendations = self.generate_cost_recommendations(cost_analysis, total_cost)
            return {
                'total_cost': total_cost,
                'cost_by_service': cost_analysis,
                'recommendations': recommendations,
                'period': f"{start_date.strftime('%Y-%m-%d')} to {end_date.strftime('%Y-%m-%d')}"
            }
        except Exception as e:
            print(f"Error analyzing costs: {str(e)}")
            return None

    def generate_cost_recommendations(self, cost_analysis, total_cost):
        """
        Generate cost optimization recommendations.

        Flags services that dominate spend (>30% HIGH, >15% MEDIUM) and
        services whose recent 7-day average grew >50% versus the first
        week of data. Returns a list of recommendation dicts.
        """
        recommendations = []
        # Spend-share checks need a positive total to compute percentages.
        # BUG FIX: previously this divided by zero when total_cost was 0
        # but cost_analysis still contained services.
        if total_cost > 0:
            # Sort services by cost
            sorted_services = sorted(
                cost_analysis.items(),
                key=lambda x: x[1]['total_cost'],
                reverse=True
            )
            for service_name, data in sorted_services[:5]:  # Top 5 most expensive services
                service_cost = data['total_cost']
                cost_percentage = (service_cost / total_cost) * 100
                if cost_percentage > 30:
                    recommendations.append({
                        'priority': 'HIGH',
                        'service': service_name,
                        'issue': f"High cost service (£{service_cost:.2f}, {cost_percentage:.1f}% of total)",
                        'recommendations': [
                            "Consider switching to a different pricing tier",
                            "Implement request caching to reduce API calls",
                            "Optimize input data to reduce token/transaction counts",
                            "Review usage patterns for optimization opportunities"
                        ]
                    })
                elif cost_percentage > 15:
                    recommendations.append({
                        'priority': 'MEDIUM',
                        'service': service_name,
                        'issue': f"Moderate cost service (£{service_cost:.2f}, {cost_percentage:.1f}% of total)",
                        'recommendations': [
                            "Monitor usage trends for potential optimization",
                            "Implement basic caching for repeated requests",
                            "Consider batch processing for better efficiency"
                        ]
                    })
        # Check for rapid cost growth
        for service_name, data in cost_analysis.items():
            # BUG FIX: sort by date key before comparing windows - plain dict
            # order is insertion order, not chronological order.
            daily_series = [cost for _, cost in sorted(data['daily_costs'].items())]
            if len(daily_series) >= 7:  # Need at least a week of data
                recent_costs = daily_series[-7:]
                early_costs = daily_series[:7]
                recent_avg = sum(recent_costs) / len(recent_costs)
                early_avg = sum(early_costs) / len(early_costs)
                # Guard early_avg > 0 so the ratio below cannot divide by zero.
                if early_avg > 0 and recent_avg > early_avg * 1.5:  # 50% increase
                    recommendations.append({
                        'priority': 'HIGH',
                        'service': service_name,
                        'issue': f"Rapid cost growth detected ({((recent_avg/early_avg-1)*100):.0f}% increase)",
                        'recommendations': [
                            "Investigate usage spikes and implement rate limiting",
                            "Set up usage alerts to prevent runaway costs",
                            "Review application logic for efficiency improvements"
                        ]
                    })
        return recommendations

    def setup_cost_alerts(self, resource_group_name, monthly_budget_gbp):
        """
        Set up automated cost alerts for Cognitive Services.

        NOTE(review): this method builds the action-group and budget payloads
        but never submits them to an Azure API - wire these params into the
        budgets/monitor SDK before relying on the alerts. TODO confirm intent.
        """
        try:
            # Action group that e-mails the admin when a notification fires.
            action_group_params = {
                "location": "Global",
                "group_short_name": "CogSvcAlert",
                "enabled": True,
                "email_receivers": [
                    {
                        "name": "AdminEmail",
                        "email_address": os.environ.get('ADMIN_EMAIL', 'admin@company.com'),
                        "use_common_alert_schema": True
                    }
                ]
            }
            # Budget scoped to the resource group's Cognitive Services meters,
            # alerting at 80% of actual spend and 100% of forecasted spend.
            budget_params = {
                "category": "Cost",
                "amount": monthly_budget_gbp,
                "time_grain": "Monthly",
                "time_period": {
                    "start_date": datetime.utcnow().replace(day=1).isoformat(),
                },
                "filters": {
                    "resource_groups": [resource_group_name],
                    "meters": ["Cognitive Services"]
                },
                "notifications": {
                    "actual_80": {
                        "enabled": True,
                        "operator": "GreaterThan",
                        "threshold": 80,
                        "contact_emails": [os.environ.get('ADMIN_EMAIL', 'admin@company.com')],
                        "contact_groups": []
                    },
                    "forecasted_100": {
                        "enabled": True,
                        "operator": "GreaterThan",
                        "threshold": 100,
                        "contact_emails": [os.environ.get('ADMIN_EMAIL', 'admin@company.com')],
                        "contact_groups": []
                    }
                }
            }
            print("✅ Cost alerts configured successfully")
            return True
        except Exception as e:
            print(f"Error setting up cost alerts: {str(e)}")
            return False
# Usage example: run a 30-day analysis and print a summary report.
monitor = CognitiveServicesCostMonitor("your-subscription-id")
cost_analysis = monitor.analyze_cognitive_services_costs(days_back=30)

if cost_analysis:
    print("=== Cognitive Services Cost Analysis ===")
    print(f"Total cost: £{cost_analysis['total_cost']:.2f}")
    print(f"Analysis period: {cost_analysis['period']}")
    print()

    # Rank services by spend, highest first.
    print("Top services by cost:")
    by_spend = sorted(
        cost_analysis['cost_by_service'].items(),
        key=lambda item: item[1]['total_cost'],
        reverse=True,
    )
    for name, details in by_spend[:5]:
        print(f" {name}: £{details['total_cost']:.2f}")
    print()

    # Show each recommendation with its top two suggested actions.
    print("Recommendations:")
    for advice in cost_analysis['recommendations']:
        print(f"🔥 {advice['priority']}: {advice['issue']}")
        for tip in advice['recommendations'][:2]:
            print(f" • {tip}")
3. Request Optimization and Caching Strategies
# Intelligent caching and request optimization for Cognitive Services
import hashlib
import json
import time
from functools import wraps
from typing import Optional, Dict, Any
import redis
import requests
class CognitiveServicesOptimizer:
    """Response cache for Cognitive Services calls to cut repeat-request spend.

    Uses Redis when a connection string is supplied and reachable, with a
    plain in-process dict as the fallback backend, and tracks hit/miss
    statistics plus an estimate of money saved.
    """

    def __init__(self, redis_connection_string: str = None):
        """Set up the cache backend (Redis if possible) and zeroed stats."""
        self.redis_client = None
        if redis_connection_string:
            try:
                client = redis.from_url(redis_connection_string)
                client.ping()  # fail fast if the server is unreachable
                self.redis_client = client
            except Exception as e:
                print(f"Redis connection failed: {e}")
                self.redis_client = None
        # In-memory dict used whenever Redis is absent or erroring.
        self.memory_cache = {}
        self.cache_stats = {'hits': 0, 'misses': 0, 'savings_gbp': 0.0}

    def cache_request(self, service_name: str, ttl_seconds: int = 3600, cost_per_request: float = 0.001):
        """Decorator factory: cache the wrapped call's result per argument set.

        Every cache hit skips one billable API call and credits
        *cost_per_request* to the savings counter.
        """
        def decorator(func):
            @wraps(func)
            def wrapper(*args, **kwargs):
                key = self._generate_cache_key(service_name, func.__name__, args, kwargs)
                hit = self._get_from_cache(key)
                if hit is not None:
                    self.cache_stats['hits'] += 1
                    self.cache_stats['savings_gbp'] += cost_per_request
                    return hit
                # Miss: perform the real request, then remember the answer.
                fresh = func(*args, **kwargs)
                self.cache_stats['misses'] += 1
                self._store_in_cache(key, fresh, ttl_seconds)
                return fresh
            return wrapper
        return decorator

    def _generate_cache_key(self, service_name: str, function_name: str, args, kwargs) -> str:
        """Build a deterministic cache key from the call identity and arguments."""
        fingerprint = json.dumps(
            {
                'service': service_name,
                'function': function_name,
                'args': str(args),
                'kwargs': sorted(kwargs.items()) if kwargs else {}
            },
            sort_keys=True,
        )
        digest = hashlib.md5(fingerprint.encode()).hexdigest()
        return f"cognitive_cache:{digest}"

    def _get_from_cache(self, cache_key: str) -> Optional[Any]:
        """Look the key up in Redis first, then the in-memory fallback.

        Returns the cached value, or None on a miss or expired entry.
        """
        if self.redis_client:
            try:
                raw = self.redis_client.get(cache_key)
                if raw:
                    return json.loads(raw.decode('utf-8'))
            except Exception as e:
                print(f"Redis get error: {e}")
        entry = self.memory_cache.get(cache_key)
        if entry is None:
            return None
        if time.time() < entry['expires']:
            return entry['data']
        # Expired: drop the stale entry before reporting a miss.
        del self.memory_cache[cache_key]
        return None

    def _store_in_cache(self, cache_key: str, data: Any, ttl_seconds: int):
        """Write the value to Redis (JSON-encoded) or the in-memory fallback."""
        if self.redis_client:
            try:
                payload = json.dumps(data, default=str)
                self.redis_client.setex(cache_key, ttl_seconds, payload)
                return
            except Exception as e:
                print(f"Redis set error: {e}")
        self.memory_cache[cache_key] = {
            'data': data,
            'expires': time.time() + ttl_seconds
        }

    def get_cache_stats(self) -> Dict[str, Any]:
        """Summarise cache effectiveness and estimated savings."""
        hits = self.cache_stats['hits']
        misses = self.cache_stats['misses']
        total = hits + misses
        hit_rate = (hits / total * 100) if total > 0 else 0
        savings = self.cache_stats['savings_gbp']
        return {
            'total_requests': total,
            'cache_hits': hits,
            'cache_misses': misses,
            'hit_rate_percentage': round(hit_rate, 2),
            'estimated_savings_gbp': round(savings, 2),
            # Projection treats the counters as one day of traffic.
            'monthly_savings_projection_gbp': round(savings * 30, 2)
        }
# Example usage with Text Analytics
optimizer = CognitiveServicesOptimizer("redis://localhost:6379")

@optimizer.cache_request('text_analytics', ttl_seconds=1800, cost_per_request=0.00185)
def analyze_sentiment(text: str, endpoint: str, key: str):
    """Call the Text Analytics sentiment endpoint, caching identical requests."""
    payload = {
        'documents': [{'id': '1', 'language': 'en', 'text': text}]
    }
    response = requests.post(
        f"{endpoint}/text/analytics/v3.1/sentiment",
        headers={
            'Ocp-Apim-Subscription-Key': key,
            'Content-Type': 'application/json',
        },
        json=payload,
    )
    return response.json()
# Batch processing for cost efficiency
def analyze_sentiment_batch(texts: list, endpoint: str, key: str, batch_size: int = 10):
    """
    Analyse sentiment for *texts* in batches of *batch_size* documents.

    Batching amortises per-request overhead for better cost efficiency.
    Returns a list of result documents; each document's 'id' is the index
    of the corresponding text in *texts*, so results can be mapped back
    to their inputs even when some batches fail and are skipped.
    """
    results = []
    # Headers are identical for every batch, so build them once.
    headers = {
        'Ocp-Apim-Subscription-Key': key,
        'Content-Type': 'application/json'
    }
    for i in range(0, len(texts), batch_size):
        batch = texts[i:i + batch_size]
        # BUG FIX: ids must be globally unique. Previously enumerate()
        # restarted at 0 for every batch, so documents from different
        # batches shared ids and could not be mapped back to inputs.
        documents = [
            {'id': str(i + idx), 'language': 'en', 'text': text}
            for idx, text in enumerate(batch)
        ]
        body = {'documents': documents}
        try:
            response = requests.post(
                f"{endpoint}/text/analytics/v3.1/sentiment",
                headers=headers,
                json=body
            )
            if response.status_code == 200:
                batch_results = response.json()
                results.extend(batch_results.get('documents', []))
            else:
                print(f"Batch request failed: {response.status_code}")
        except Exception as e:
            print(f"Error processing batch: {e}")
    return results
4. OpenAI Token Optimization Strategies
# OpenAI token optimization for cost control
import tiktoken
import json
from typing import List, Dict, Any
class OpenAITokenOptimizer:
    """Count, cost and trim OpenAI prompt tokens to keep spend under control.

    Prices are article-era GBP figures per 1K tokens; update `token_costs`
    as Azure OpenAI pricing changes.
    """

    def __init__(self, model_name: str = "gpt-3.5-turbo"):
        """
        Initialize with specific model for accurate token counting.
        """
        self.model_name = model_name
        try:
            self.encoding = tiktoken.encoding_for_model(model_name)
        except KeyError:
            # Fallback to cl100k_base encoding for newer models
            self.encoding = tiktoken.get_encoding("cl100k_base")
        # Token costs (in GBP per 1K tokens)
        self.token_costs = {
            "gpt-3.5-turbo": {"input": 0.0012, "output": 0.0016},
            "gpt-4": {"input": 0.024, "output": 0.048},
            "gpt-4-32k": {"input": 0.048, "output": 0.096}
        }

    def count_tokens(self, text: str) -> int:
        """Return the number of tokens *text* encodes to for this model."""
        return len(self.encoding.encode(text))

    def _model_costs(self) -> Dict[str, float]:
        """Per-1K-token prices for the current model (GPT-3.5 fallback)."""
        return self.token_costs.get(self.model_name, self.token_costs["gpt-3.5-turbo"])

    def estimate_cost(self, input_text: str, expected_output_tokens: int = 100) -> Dict[str, float]:
        """
        Estimate the GBP cost of one request.

        Counts the real input tokens and assumes *expected_output_tokens*
        for the completion.
        """
        input_tokens = self.count_tokens(input_text)
        costs = self._model_costs()
        input_cost = (input_tokens / 1000) * costs["input"]
        output_cost = (expected_output_tokens / 1000) * costs["output"]
        return {
            "input_tokens": input_tokens,
            "estimated_output_tokens": expected_output_tokens,
            "input_cost_gbp": round(input_cost, 6),
            "output_cost_gbp": round(output_cost, 6),
            "total_estimated_cost_gbp": round(input_cost + output_cost, 6)
        }

    def optimize_prompt(self, original_prompt: str, max_tokens: int = 4000) -> Dict[str, Any]:
        """
        Shorten a prompt that exceeds *max_tokens* while keeping its meaning.

        Applies whitespace compression, verbose-phrase replacement and
        common abbreviations, reporting tokens and input cost saved.
        Prompts already within budget are returned unchanged.
        """
        current_tokens = self.count_tokens(original_prompt)
        if current_tokens <= max_tokens:
            return {
                "optimized_prompt": original_prompt,
                "original_tokens": current_tokens,
                "optimized_tokens": current_tokens,
                "tokens_saved": 0,
                "optimization_applied": "none"
            }
        optimizations = []
        optimized_prompt = original_prompt
        # 1. Remove redundant whitespace and formatting
        import re
        compressed = re.sub(r'\s+', ' ', optimized_prompt.strip())
        if len(compressed) < len(optimized_prompt):
            optimized_prompt = compressed
            optimizations.append("whitespace_compression")
        # 2. Replace verbose phrases with concise alternatives
        replacements = {
            "please provide me with": "provide",
            "I would like you to": "please",
            "Can you help me": "help me",
            "I need you to": "",
            "make sure that": "ensure",
            "in order to": "to",
            "due to the fact that": "because",
            "it is important to note that": "note:",
            "please be aware that": "note:"
        }
        for verbose, concise in replacements.items():
            if verbose.lower() in optimized_prompt.lower():
                optimized_prompt = optimized_prompt.replace(verbose, concise)
                optimized_prompt = optimized_prompt.replace(verbose.capitalize(), concise.capitalize())
                # BUG FIX: record each strategy once, not once per phrase.
                if "phrase_compression" not in optimizations:
                    optimizations.append("phrase_compression")
        # 3. Abbreviate common terms
        abbreviations = {
            "information": "info",
            "application": "app",
            "configuration": "config",
            "documentation": "docs",
            "maximum": "max",
            "minimum": "min",
            "authenticate": "auth",
            "database": "db"
        }
        for full, abbrev in abbreviations.items():
            if full in optimized_prompt and len(full) - len(abbrev) > 2:
                optimized_prompt = optimized_prompt.replace(full, abbrev)
                if "abbreviation" not in optimizations:
                    optimizations.append("abbreviation")
        optimized_tokens = self.count_tokens(optimized_prompt)
        tokens_saved = current_tokens - optimized_tokens
        return {
            "optimized_prompt": optimized_prompt,
            "original_tokens": current_tokens,
            "optimized_tokens": optimized_tokens,
            "tokens_saved": tokens_saved,
            "optimization_applied": optimizations,
            # BUG FIX: use the same unknown-model fallback as estimate_cost()
            # instead of a bare dict lookup that raised KeyError.
            "cost_savings_gbp": (tokens_saved / 1000) * self._model_costs()["input"]
        }

    def create_efficient_conversation_history(self, messages: List[Dict], max_tokens: int = 3000) -> List[Dict]:
        """
        Trim conversation history to fit a token budget.

        Keeps the first system message and the final user message, then
        re-adds the most recent user/assistant pairs that still fit within
        *max_tokens*. Token counts are measured on the JSON-serialised
        messages, which slightly over-counts versus the raw content.
        """
        if not messages:
            return messages
        system_messages = [msg for msg in messages if msg.get('role') == 'system']
        user_messages = [msg for msg in messages if msg.get('role') == 'user']
        assistant_messages = [msg for msg in messages if msg.get('role') == 'assistant']
        # Start with the first system message (if any).
        essential_messages = []
        if system_messages:
            essential_messages.extend(system_messages[:1])  # Keep first system message
        # BUG FIX: with no user messages the original code crashed with
        # IndexError below; fall back to the essential (system) messages.
        if not user_messages:
            return essential_messages if essential_messages else messages
        final_user = user_messages[-1]
        essential_messages.append(final_user)  # Keep last user message
        # Remaining budget after the must-keep messages.
        essential_tokens = sum(self.count_tokens(json.dumps(msg)) for msg in essential_messages)
        remaining_tokens = max_tokens - essential_tokens
        # Walk the conversation backwards, keeping whole user+assistant pairs
        # while they fit in the remaining budget.
        conversation_pairs = []
        for i in range(len(user_messages) - 1, -1, -1):
            if i < len(assistant_messages):
                pair_tokens = (
                    self.count_tokens(json.dumps(user_messages[i])) +
                    self.count_tokens(json.dumps(assistant_messages[i]))
                )
                if pair_tokens <= remaining_tokens:
                    conversation_pairs.insert(0, (user_messages[i], assistant_messages[i]))
                    remaining_tokens -= pair_tokens
                else:
                    break
        # Rebuild: system message, retained pairs, then the final user message.
        optimized_messages = []
        if system_messages:
            optimized_messages.extend(system_messages[:1])
        for user_msg, assistant_msg in conversation_pairs:
            # BUG FIX: skip the pair containing the final user message instead
            # of blindly dropping the last pair - the old [:-1] slice discarded
            # a valid pair whenever the final user message had no reply yet.
            if user_msg is final_user:
                continue
            optimized_messages.extend([user_msg, assistant_msg])
        optimized_messages.append(final_user)
        return optimized_messages
# Example usage
optimizer = OpenAITokenOptimizer("gpt-3.5-turbo")

# Analyze a prompt
original_prompt = """
I would like you to please provide me with a comprehensive analysis of the given text.
Please make sure that you analyze the sentiment, extract key entities, and also provide
a summary. It is important to note that the analysis should be thorough and detailed.
Can you help me with this task? I need you to be as accurate as possible.
"""

optimization_result = optimizer.optimize_prompt(original_prompt)
cost_analysis = optimizer.estimate_cost(optimization_result['optimized_prompt'])

# Print a before/after token report for the optimised prompt.
print("=== OpenAI Prompt Optimization ===")
summary_lines = [
    f"Original tokens: {optimization_result['original_tokens']}",
    f"Optimized tokens: {optimization_result['optimized_tokens']}",
    f"Tokens saved: {optimization_result['tokens_saved']}",
    f"Cost savings: £{optimization_result.get('cost_savings_gbp', 0):.6f} per request",
    f"Optimizations applied: {', '.join(optimization_result['optimization_applied'])}",
]
for line in summary_lines:
    print(line)
print()
print("Optimized prompt:")
print(optimization_result['optimized_prompt'])
5. Alternative Service Selection Matrix
# Service selection optimizer based on requirements and cost
class CognitiveServiceSelector:
def __init__(self):
self.services_matrix = {
"text_analysis": {
"azure_text_analytics": {
"cost_per_1k": 1.85, # GBP
"languages": 120,
"features": ["sentiment", "key_phrases", "entities", "language_detection"],
"accuracy": "high",
"latency_ms": 200,
"best_for": "Enterprise applications requiring high accuracy"
},
"azure_openai_gpt35": {
"cost_per_1k_tokens": 1.2, # Input tokens
"languages": 95,
"features": ["sentiment", "summarization", "qa", "classification"],
"accuracy": "very_high",
"latency_ms": 800,
"best_for": "Complex analysis requiring reasoning"
},
"open_source_alternative": {
"cost_per_1k": 0.1, # Self-hosted costs
"languages": 50,
"features": ["sentiment", "basic_entities"],
"accuracy": "medium",
"latency_ms": 100,
"best_for": "High-volume, cost-sensitive applications"
}
},
"image_analysis": {
"azure_computer_vision": {
"cost_per_1k": 0.74,
"features": ["ocr", "object_detection", "face_detection", "brand_detection"],
"accuracy": "high",
"max_image_size_mb": 4,
"latency_ms": 500,
"best_for": "Production image analysis"
},
"azure_custom_vision": {
"cost_per_1k_predictions": 1.48,
"cost_per_hour_training": 14.80,
"features": ["custom_classification", "object_detection"],
"accuracy": "very_high",
"best_for": "Domain-specific image classification"
},
"open_source_alternative": {
"cost_per_1k": 0.05, # Compute costs only
"features": ["basic_classification", "object_detection"],
"accuracy": "medium",
"setup_complexity": "high",
"best_for": "High-volume, budget-constrained scenarios"
}
},
"speech_services": {
"azure_speech_to_text": {
"cost_per_hour": 0.74,
"languages": 85,
"features": ["real_time", "batch", "custom_models"],
"accuracy": "high",
"best_for": "Production speech applications"
},
"whisper_api": {
"cost_per_minute": 0.005,
"languages": 99,
"features": ["transcription", "translation"],
"accuracy": "very_high",
"best_for": "High-accuracy transcription needs"
}
}
}
def recommend_service(self, category: str, requirements: dict) -> dict:
"""
Recommend the best service based on requirements
"""
if category not in self.services_matrix:
return {"error": f"Category '{category}' not supported"}
services = self.services_matrix[category]
scored_services = []
for service_name, service_info in services.items():
score = self._calculate_service_score(service_info, requirements)
scored_services.append({
"service": service_name,
"score": score,
"info": service_info,
"monthly_cost_estimate": self._estimate_monthly_cost(service_info, requirements)
})
# Sort by score (higher is better)
scored_services.sort(key=lambda x: x["score"], reverse=True)
return {
"recommended_service": scored_services[0],
"alternatives": scored_services[1:],
"selection_rationale": self._generate_rationale(scored_services[0], requirements)
}
def _calculate_service_score(self, service_info: dict, requirements: dict) -> float:
"""
Calculate a score for service based on requirements
"""
score = 0.0
# Cost weight (lower cost = higher score)
max_monthly_budget = requirements.get("max_monthly_budget_gbp", 1000)
estimated_cost = self._estimate_monthly_cost(service_info, requirements)
if estimated_cost <= max_monthly_budget:
cost_score = (max_monthly_budget - estimated_cost) / max_monthly_budget * 40
score += cost_score
# Accuracy weight
accuracy_importance = requirements.get("accuracy_importance", 5) # 1-10 scale
accuracy_mapping = {"low": 1, "medium": 5, "high": 8, "very_high": 10}
accuracy_score = accuracy_mapping.get(service_info.get("accuracy", "medium"), 5)
score += (accuracy_score / 10) * accuracy_importance * 4
# Feature match weight
required_features = set(requirements.get("required_features", []))
available_features = set(service_info.get("features", []))
feature_match_ratio = len(required_features & available_features) / len(required_features) if required_features else 1
score += feature_match_ratio * 30
# Latency weight (if specified)
max_latency_ms = requirements.get("max_latency_ms")
if max_latency_ms and "latency_ms" in service_info:
if service_info["latency_ms"] <= max_latency_ms:
latency_score = (max_latency_ms - service_info["latency_ms"]) / max_latency_ms * 20
score += latency_score
return round(score, 2)
def _estimate_monthly_cost(self, service_info: dict, requirements: dict) -> float:
"""
Estimate monthly cost based on expected usage
"""
monthly_volume = requirements.get("monthly_volume", 10000)
if "cost_per_1k" in service_info:
return (monthly_volume / 1000) * service_info["cost_per_1k"]
elif "cost_per_1k_tokens" in service_info:
avg_tokens_per_request = requirements.get("avg_tokens_per_request", 100)
total_tokens = monthly_volume * avg_tokens_per_request
return (total_tokens / 1000) * service_info["cost_per_1k_tokens"]
elif "cost_per_hour" in service_info:
hours_per_month = requirements.get("hours_per_month", 100)
return hours_per_month * service_info["cost_per_hour"]
elif "cost_per_minute" in service_info:
minutes_per_month = requirements.get("minutes_per_month", 1000)
return minutes_per_month * service_info["cost_per_minute"]
return 0.0
def _generate_rationale(self, recommended_service: dict, requirements: dict) -> str:
"""
Generate explanation for service recommendation
"""
service_name = recommended_service["service"]
cost = recommended_service["monthly_cost_estimate"]
rationale = f"{service_name} is recommended based on your requirements. "
rationale += f"Estimated monthly cost: £{cost:.2f}. "
if cost < requirements.get("max_monthly_budget_gbp", 1000) * 0.5:
rationale += "This option provides excellent cost efficiency. "
accuracy = recommended_service["info"].get("accuracy", "medium")
if accuracy in ["high", "very_high"]:
rationale += "High accuracy ensures reliable results. "
return rationale
# Example usage
selector = CognitiveServiceSelector()

# Define requirements for text analysis
requirements = {
    "max_monthly_budget_gbp": 500,
    "monthly_volume": 50000,  # 50k requests per month
    "required_features": ["sentiment", "entities"],
    "accuracy_importance": 8,  # High importance (1-10)
    "max_latency_ms": 1000
}

recommendation = selector.recommend_service("text_analysis", requirements)
top_pick = recommendation['recommended_service']

print("=== Service Recommendation ===")
print(f"Recommended: {top_pick['service']}")
print(f"Estimated monthly cost: £{top_pick['monthly_cost_estimate']:.2f}")
print(f"Score: {top_pick['score']}/100")
print(f"Rationale: {recommendation['selection_rationale']}")
print()

# Show the runner-up options with their cost and score.
print("Alternatives:")
for alt in recommendation['alternatives'][:2]:
    print(f" {alt['service']}: £{alt['monthly_cost_estimate']:.2f}/month (Score: {alt['score']})")
Why It Matters
- Cost Predictability: Control AI spending before it spirals out of control
- ROI Optimization: Ensure AI features deliver business value proportional to cost
- Service Selection: Choose the right AI service tier for each use case
- Token Efficiency: Optimize AI prompts to reduce unnecessary token usage
- Budget Management: Implement proactive monitoring and alerting for AI costs
Try This Week
- Audit current AI spending – Run the cost analysis script on your Cognitive Services
- Implement basic caching – Add caching to your most frequent API calls
- Optimize one OpenAI prompt – Use the token optimizer on your longest prompts
- Set up cost alerts – Configure budget alerts for your AI services
Quick Cognitive Services Cost Assessment
#!/bin/bash
# Cognitive Services cost assessment script
#
# Lists Cognitive Services resources, recent spend and usage metrics for a
# resource group, then prints optimisation pointers.
# Requires: Azure CLI ('az') logged in with read access to the subscription.

SUBSCRIPTION_ID="your-subscription-id"   # placeholder; set before use
RESOURCE_GROUP="your-resource-group"

echo "=== Azure Cognitive Services Cost Assessment ==="
echo

echo "📊 Current Cognitive Services resources:"
# Variables are quoted so names with spaces don't word-split.
az cognitiveservices account list \
    --resource-group "$RESOURCE_GROUP" \
    --query '[].{Name:name,Kind:kind,Sku:sku.name,Location:location}' \
    --output table

echo
echo "💰 Cost analysis (last 30 days):"
end_date=$(date -u +%Y-%m-%d)
# BUG FIX: 'date -d' is GNU-only; fall back to BSD/macOS 'date -v' syntax.
start_date=$(date -u -d '30 days ago' +%Y-%m-%d 2>/dev/null || date -u -v-30d +%Y-%m-%d)

az consumption usage list \
    --start-date "$start_date" \
    --end-date "$end_date" \
    --query "[?contains(product,'Cognitive Services')].{Service:product,Cost:pretaxCost,Quantity:quantity}" \
    --output table

echo
echo "🔍 Usage patterns by service:"
az monitor metrics list \
    --resource-group "$RESOURCE_GROUP" \
    --resource-type "Microsoft.CognitiveServices/accounts" \
    --metric "TotalCalls,TotalErrors" \
    --start-time "$start_date" \
    --end-time "$end_date" \
    --output table 2>/dev/null || echo "Metrics require additional permissions"

echo
echo "⚠️ High-cost service indicators:"
echo "1. Check for OpenAI GPT-4 usage vs GPT-3.5 needs"
echo "2. Review Text Analytics volume and consider batch processing"
echo "3. Verify Computer Vision image sizes and optimize if needed"
echo "4. Look for unused or redundant Cognitive Services instances"

echo
echo "🎯 Cost optimization recommendations:"
echo "1. Implement caching for repeated requests"
echo "2. Use batch processing where available"
echo "3. Optimize OpenAI prompts to reduce token usage"
echo "4. Consider service tier adjustments based on usage"
echo "5. Set up cost alerts and budgets"
echo "6. Review free tier utilization before upgrading"
Common Cognitive Services Cost Mistakes
- Over-using GPT-4: Using expensive models for tasks GPT-3.5 can handle
- No request caching: Making redundant API calls for identical requests
- Inefficient prompts: Using verbose prompts that waste tokens
- Wrong service tiers: Paying for premium features not being used
- Missing batch processing: Making individual calls instead of batching
- No usage monitoring: Flying blind on actual API consumption patterns
Advanced Cost Optimization Strategies
- Multi-model routing: Route requests to appropriate models based on complexity
- Request deduplication: Identify and eliminate duplicate API calls
- Usage-based alerting: Set up alerts based on API call volumes, not just costs
- A/B testing for models: Compare accuracy vs cost across different service options
Pro Tip: Start cost optimization with OpenAI services first – they often represent 60-80% of Cognitive Services costs and have the highest optimization potential through prompt engineering and model selection.








