Idempotency & Retry Mechanisms
Overview
Idempotency ensures that onboarding operations can be safely retried without causing duplicate or inconsistent state. This is critical for reliability in distributed systems where failures are inevitable.
Idempotency Principles
1. Idempotency Key (Fingerprinting)
// src/onboarding/idempotency/fingerprint.ts
/**
 * Computes a deterministic fingerprint for an onboarding request and uses it
 * to make the onboarding trigger safe to call repeatedly.
 */
export class OnboardingFingerprint {
  /** Cool-down after a failed run before a retry is accepted (1 hour). */
  private static readonly RETRY_COOLDOWN_MS = 60 * 60 * 1000;

  /**
   * Generate unique fingerprint for onboarding request.
   *
   * @param context - Request whose tenant, repository URL and (optional)
   *   workspace identify the onboarding.
   * @returns Hex-encoded SHA-256 of the joined components.
   */
  generate(context: OnboardingContext): string {
    // NOTE(review): components are joined with ':' without escaping, so values
    // that themselves contain ':' can in principle produce the same joined
    // string. The format is kept as-is because previously stored fingerprints
    // must keep matching.
    const components = [
      context.tenantId,
      context.repoUrl,
      context.workspaceId || 'no-workspace',
    ];

    return crypto
      .createHash('sha256')
      .update(components.join(':'))
      .digest('hex');
  }

  /**
   * Look up the most recent onboarding record with this fingerprint.
   *
   * @returns The newest matching record, or `null` when there is none.
   */
  async findExisting(fingerprint: string): Promise<OnboardingRecord | null> {
    const record = await db('onboarding_history')
      .where('fingerprint', fingerprint)
      .orderBy('created_at', 'desc')
      .first();

    // `.first()` resolves to undefined when no row matches; normalise it so
    // the declared `| null` contract actually holds.
    return record ?? null;
  }

  /**
   * Idempotent onboarding trigger.
   *
   * Returns the stored outcome for completed / in-progress runs, resumes a
   * failed run once the cool-down has elapsed, and otherwise starts a new one.
   *
   * @throws Error when the previous run failed less than the cool-down ago.
   */
  async triggerIdempotent(context: OnboardingContext): Promise<OnboardingResult> {
    const fingerprint = this.generate(context);
    const existing = await this.findExisting(fingerprint);

    if (existing) {
      switch (existing.status) {
        case 'completed':
          return {
            onboardingId: existing.id,
            status: 'already_completed',
            entities: existing.entities_created,
            message: 'Onboarding already completed for this business unit',
          };

        case 'in_progress':
          return {
            onboardingId: existing.id,
            status: 'in_progress',
            currentState: existing.current_state,
            message: 'Onboarding already in progress',
          };

        case 'failed': {
          const canRetry = await this.canRetryFailure(existing);
          if (!canRetry) {
            throw new Error('Recent onboarding failure, wait before retry');
          }
          return await this.resumeFromFailure(existing);
        }

        default:
          // Any other status is treated like "no usable record".
          break;
      }
    }

    return await this.startNewOnboarding(context, fingerprint);
  }

  /**
   * Decide whether a failed onboarding may be retried yet.
   *
   * Accepts `failed_at` as a Date, a timestamp number or a date string, since
   * database drivers differ in what they hand back for timestamp columns.
   */
  private async canRetryFailure(existing: OnboardingRecord): Promise<boolean> {
    const failedAt: unknown = existing.failed_at;
    if (failedAt == null) {
      // No recorded failure time means there is no cool-down window to enforce.
      return true;
    }

    const failedAtMs = new Date(failedAt as string | number | Date).getTime();
    if (Number.isNaN(failedAtMs)) {
      return true;
    }

    return Date.now() - failedAtMs >= OnboardingFingerprint.RETRY_COOLDOWN_MS;
  }
}
Retry Strategies
1. Exponential Backoff
// src/onboarding/retry/exponential-backoff.ts
/**
 * Retries an async operation with exponentially growing, jittered delays.
 */
export class ExponentialBackoff {
  /** Network-level error codes that indicate a transient failure. */
  private static readonly RETRYABLE_CODES: readonly string[] = [
    'ECONNRESET',
    'ETIMEDOUT',
    'ENOTFOUND',
    'ENETUNREACH',
    'EAI_AGAIN',
  ];

  /** Fraction of the capped delay used as the maximum jitter (±25%). */
  private static readonly JITTER_RATIO = 0.25;

  constructor(
    private baseDelayMs: number = 1000,
    private maxDelayMs: number = 60000,
    private maxAttempts: number = 5
  ) {}

  /**
   * Execute operation with exponential backoff.
   *
   * @param operation - Async work to attempt; invoked up to `maxAttempts` times.
   * @param context - Identifiers used only for log and error messages.
   * @returns The first successful result of `operation`.
   * @throws The original error as soon as a non-retryable error occurs.
   * @throws RetryExhaustedError when every attempt failed with a retryable error.
   */
  async execute<T>(
    operation: () => Promise<T>,
    context: { name: string; onboardingId: string }
  ): Promise<T> {
    // `unknown` because anything can be thrown, and the loop may not run at all.
    let lastError: unknown;

    for (let attempt = 1; attempt <= this.maxAttempts; attempt++) {
      try {
        console.log(`${context.name}: Attempt ${attempt}/${this.maxAttempts}`);
        const result = await operation();

        if (attempt > 1) {
          console.log(`${context.name}: Succeeded on attempt ${attempt}`);
        }
        return result;
      } catch (error: unknown) {
        lastError = error;

        if (!this.isRetryable(error)) {
          console.error(`${context.name}: Non-retryable error:`, error);
          throw error;
        }

        // Last attempt - don't delay, fall through to the exhausted error.
        if (attempt === this.maxAttempts) {
          console.error(`${context.name}: All ${this.maxAttempts} attempts failed`);
          break;
        }

        const delay = this.calculateDelay(attempt);
        console.warn(`${context.name}: Attempt ${attempt} failed, retrying in ${delay}ms`, error);

        await new Promise(resolve => setTimeout(resolve, delay));
      }
    }

    throw new RetryExhaustedError(
      `${context.name} failed after ${this.maxAttempts} attempts`,
      { lastError, attempts: this.maxAttempts }
    );
  }

  /**
   * Delay before the retry that follows `attempt` (1-based).
   *
   * Doubles per attempt from `baseDelayMs`, is capped at `maxDelayMs`, and is
   * spread by a uniform jitter of ±25%. The result is clamped to
   * `[0, maxDelayMs]` so jitter can never exceed the configured maximum.
   */
  private calculateDelay(attempt: number): number {
    // Exponential: 1s, 2s, 4s, 8s, 16s (with the default base delay)
    const exponentialDelay = this.baseDelayMs * Math.pow(2, attempt - 1);
    const cappedDelay = Math.min(exponentialDelay, this.maxDelayMs);

    // Math.random() * 2 - 1 is uniform in [-1, 1), giving the full ±25% range.
    const jitter =
      cappedDelay * ExponentialBackoff.JITTER_RATIO * (Math.random() * 2 - 1);

    const jittered = Math.floor(cappedDelay + jitter);
    return Math.max(0, Math.min(this.maxDelayMs, jittered));
  }

  /**
   * Classify a thrown value as transient (worth retrying) or permanent.
   *
   * Retryable: known network error codes, HTTP 5xx / 429 / 408, and messages
   * mentioning "rate limit" or "temporarily unavailable" (case-insensitive).
   * Anything else, including non-object thrown values, is not retryable.
   */
  private isRetryable(error: unknown): boolean {
    if (typeof error !== 'object' || error === null) {
      return false;
    }

    const { code, status, message } = error as {
      code?: unknown;
      status?: unknown;
      message?: unknown;
    };

    if (typeof code === 'string' && ExponentialBackoff.RETRYABLE_CODES.includes(code)) {
      return true;
    }

    if (status) {
      // Number() also accepts numeric strings; non-numeric input becomes NaN
      // and fails every comparison below.
      const statusCode = Number(status);
      if (statusCode >= 500 && statusCode < 600) {
        return true;
      }
      if (statusCode === 429 || statusCode === 408) {
        return true;
      }
    }

    if (typeof message === 'string') {
      const normalized = message.toLowerCase();
      if (
        normalized.includes('rate limit') ||
        normalized.includes('temporarily unavailable')
      ) {
        return true;
      }
    }

    // Non-retryable (validation errors, authorization, etc.)
    return false;
  }
}
2. Per-State Retry Configuration
// src/onboarding/retry/state-retry-config.ts
/**
 * Retry budget for each onboarding state.
 *
 * `baseDelay` / `maxDelay` appear to be milliseconds (2_000 here corresponds
 * to the "2s" shown in the summary table); `timeout` is presumably in
 * milliseconds as well — TODO confirm against the consumer of this table.
 * States with a single attempt carry no delay settings because they are
 * never retried.
 */
export const STATE_RETRY_CONFIG: Record<OnboardingState, RetryConfig> = {
  // Single attempt: nothing to retry at intake.
  RECEIVED: {
    maxAttempts: 1,
    timeout: 1_000,
  },

  // Single attempt: validation is not retried.
  VALIDATING: {
    maxAttempts: 1,
    timeout: 10_000,
  },

  DISCOVERING: {
    maxAttempts: 3,
    baseDelay: 2_000,
    maxDelay: 30_000,
    timeout: 60_000,
  },

  EXTRACTING: {
    maxAttempts: 3,
    baseDelay: 1_000,
    maxDelay: 15_000,
    timeout: 30_000,
  },

  GENERATING: {
    maxAttempts: 3,
    baseDelay: 1_000,
    maxDelay: 10_000,
    timeout: 20_000,
  },

  // Critical step: gets the largest retry budget.
  REGISTERING: {
    maxAttempts: 5,
    baseDelay: 2_000,
    maxDelay: 60_000,
    timeout: 30_000,
  },

  CONFIGURING_SYNC: {
    maxAttempts: 3,
    baseDelay: 2_000,
    maxDelay: 30_000,
    timeout: 30_000,
  },

  // Non-critical step: retried only once more.
  NOTIFYING: {
    maxAttempts: 2,
    baseDelay: 1_000,
    maxDelay: 5_000,
    timeout: 10_000,
  },

  // Critical cleanup: same attempt budget as REGISTERING.
  ROLLING_BACK: {
    maxAttempts: 5,
    baseDelay: 3_000,
    maxDelay: 60_000,
    timeout: 60_000,
  },
};
Partial Onboarding Recovery
1. State Persistence
// src/onboarding/recovery/state-persistence.ts
/**
 * Persists onboarding checkpoints so an interrupted run can be resumed.
 */
export class OnboardingStatePersistence {
  /**
   * Save current state and progress.
   *
   * Writes one row per onboarding: inserts it the first time and overwrites
   * it on later checkpoints.
   */
  async saveCheckpoint(
    onboardingId: string,
    state: OnboardingState,
    context: OnboardingContext
  ): Promise<void> {
    // One timestamp for both columns so they can never disagree.
    const now = new Date();

    // Knex only implements `.upsert()` for the CockroachDB dialect; the
    // portable form is insert + onConflict + merge.
    // NOTE(review): assumes `onboarding_id` has a unique/primary-key
    // constraint on `onboarding_state` — confirm against the migration.
    await db('onboarding_state')
      .insert({
        onboarding_id: onboardingId,
        current_state: state,
        context: JSON.stringify(context),
        last_checkpoint: now,
        updated_at: now,
      })
      .onConflict('onboarding_id')
      .merge();

    console.log(`Checkpoint saved: ${onboardingId} at state ${state}`);
  }

  /**
   * Load the stored state and context for an onboarding.
   *
   * @throws Error when no checkpoint row exists for `onboardingId`.
   */
  async loadCheckpoint(onboardingId: string): Promise<{
    state: OnboardingState;
    context: OnboardingContext;
  }> {
    const checkpoint = await db('onboarding_state')
      .where('onboarding_id', onboardingId)
      .first();

    if (!checkpoint) {
      throw new Error(`No checkpoint found for onboarding ${onboardingId}`);
    }

    // Drivers hand JSON/JSONB columns back already parsed, while TEXT columns
    // come back as the serialized string; accept both.
    const context =
      typeof checkpoint.context === 'string'
        ? JSON.parse(checkpoint.context)
        : checkpoint.context;

    return {
      state: checkpoint.current_state,
      context,
    };
  }

  /**
   * Resume from last successful state.
   *
   * Loads the checkpoint, positions a new state machine at the saved state
   * and runs it.
   */
  async resume(onboardingId: string): Promise<OnboardingResult> {
    const { state, context } = await this.loadCheckpoint(onboardingId);
    console.log(`Resuming onboarding ${onboardingId} from state ${state}`);

    // NOTE(review): `handlers` is not defined in this snippet; it has to be
    // supplied by the surrounding module.
    const machine = new OnboardingStateMachine(context, handlers);
    machine.setState(state);
    return await machine.run();
  }
}
2. Incremental Progress Tracking
// src/onboarding/recovery/progress-tracker.ts
/**
 * Records which sub-tasks of an onboarding have finished so that a retried
 * run can skip work that was already done.
 */
export class OnboardingProgressTracker {
  /**
   * Track completed sub-tasks by inserting a completion row.
   */
  async markSubtaskComplete(
    onboardingId: string,
    state: OnboardingState,
    subtask: string
  ): Promise<void> {
    await db('onboarding_progress').insert({
      onboarding_id: onboardingId,
      state,
      subtask,
      completed_at: new Date(),
    });
  }

  /**
   * Check if subtask already completed.
   *
   * @returns `true` when a completion row exists for the given triple.
   */
  async isSubtaskComplete(
    onboardingId: string,
    state: OnboardingState,
    subtask: string
  ): Promise<boolean> {
    const record = await db('onboarding_progress')
      .where({
        onboarding_id: onboardingId,
        state,
        subtask,
      })
      .first();

    return !!record;
  }

  /**
   * Example: Idempotent entity creation.
   *
   * Creates the entity once per onboarding. On a repeated call the id of the
   * previously created entity is returned instead of inserting again.
   *
   * @returns The id of the created or already existing entity.
   */
  async createEntityIdempotent(
    onboardingId: string,
    entity: BackstageEntity
  ): Promise<string> {
    const subtask = `entity:${entity.kind}:${entity.metadata.name}`;

    if (await this.isSubtaskComplete(onboardingId, 'GENERATING', subtask)) {
      // The subtask key is kind + name, so the lookup must match on both;
      // matching on name alone could return an entity of a different kind.
      const existing = await db('entities')
        .where('kind', entity.kind)
        .where('metadata->name', entity.metadata.name)
        .first();

      if (existing) {
        console.log(`Entity ${entity.metadata.name} already created, skipping`);
        return existing.id;
      }

      // The progress marker exists but the row is gone (for example after a
      // cleanup). Re-create the entity instead of failing on `undefined.id`;
      // the marker is already present, so it is not written again.
      console.warn(
        `Entity ${entity.metadata.name} marked complete but not found, re-creating`
      );
      return await catalog.insertEntity(entity);
    }

    const entityId = await catalog.insertEntity(entity);
    await this.markSubtaskComplete(onboardingId, 'GENERATING', subtask);

    return entityId;
  }
}
Rollback Mechanisms
1. Transactional Rollback
// src/onboarding/rollback/transactional-rollback.ts
/**
 * Runs database work inside a transaction that is rolled back on failure.
 */
export class TransactionalRollback {
  /**
   * Execute operation with automatic rollback on failure.
   *
   * @param operation - Work to run against the open transaction.
   * @param onboardingId - Used for logging and for recording the rollback.
   * @returns Whatever `operation` resolves to, after a successful commit.
   * @throws The error raised by `operation` (or by the commit). Failures of
   *   the rollback itself or of the rollback bookkeeping are logged only, so
   *   they never replace the original error.
   */
  async executeWithRollback<T>(
    operation: (trx: Knex.Transaction) => Promise<T>,
    onboardingId: string
  ): Promise<T> {
    const trx = await db.transaction();

    try {
      const result = await operation(trx);
      await trx.commit();
      return result;
    } catch (error) {
      try {
        await trx.rollback();
        console.error(`Transaction rolled back for onboarding ${onboardingId}:`, error);
      } catch (rollbackError) {
        console.error(`Rollback failed for onboarding ${onboardingId}:`, rollbackError);
      }

      try {
        await this.recordRollback(onboardingId, error);
      } catch (recordError) {
        console.error(`Could not record rollback for onboarding ${onboardingId}:`, recordError);
      }

      throw error;
    }
  }

  /**
   * Example: Atomic entity registration.
   *
   * Inserts every entity inside one transaction; if any insert fails, none
   * of them is kept.
   *
   * @returns The ids of the inserted rows, in input order.
   */
  async registerEntitiesAtomic(
    onboardingId: string,
    entities: BackstageEntity[]
  ): Promise<string[]> {
    return await this.executeWithRollback(async (trx) => {
      const entityIds: string[] = [];

      for (const entity of entities) {
        const [row] = await trx('entities').insert({
          api_version: entity.apiVersion,
          kind: entity.kind,
          metadata: JSON.stringify(entity.metadata),
          spec: JSON.stringify(entity.spec),
          // `annotations` may be absent; do not crash on it.
          tenant_id: entity.metadata.annotations?.['backstage.io/tenant-id'],
        }).returning('id');

        entityIds.push(this.extractId(row));
      }

      return entityIds;
    }, onboardingId);
  }

  /**
   * Normalise one `returning('id')` row: Knex 1.0+ yields `{ id }` objects,
   * earlier versions yield the bare id.
   */
  private extractId(row: unknown): string {
    if (typeof row === 'object' && row !== null && 'id' in row) {
      return (row as { id: string }).id;
    }
    return row as string;
  }
}
2. Compensating Transactions
// src/onboarding/rollback/compensating-rollback.ts
/**
 * Collects compensating actions while work is performed and replays them in
 * reverse when the work has to be undone.
 */
export class CompensatingRollback {
  /**
   * Actions still awaiting rollback, in registration order.
   */
  private actions: RollbackAction[] = [];

  /**
   * Register an action with its rollback.
   */
  registerAction(action: RollbackAction): void {
    this.actions.push(action);
  }

  /**
   * Execute rollback in reverse order (LIFO).
   *
   * A failing compensation is logged and does not stop the remaining ones.
   * Compensations that succeeded are forgotten, so calling `rollback` again
   * only retries the ones that failed instead of undoing everything twice.
   *
   * @param reason - Free text written to the log.
   */
  async rollback(reason: string): Promise<void> {
    console.log(`Starting rollback: ${reason}`);

    const pending = this.actions;
    const failed: RollbackAction[] = [];

    // Undo most recent first.
    for (let index = pending.length - 1; index >= 0; index--) {
      const action = pending[index];
      try {
        console.log(`Rolling back: ${action.description}`);
        await action.rollback();
        console.log(`Rolled back: ${action.description}`);
      } catch (error) {
        console.error(`Rollback failed for ${action.description}:`, error);
        // Iterating backwards, so unshift keeps registration order.
        failed.unshift(action);
      }
    }

    this.actions = failed;

    if (failed.length > 0) {
      console.error(`Rollback finished with ${failed.length} failed action(s)`);
    } else {
      console.log('Rollback complete');
    }
  }

  /**
   * Example: Entity creation with rollback.
   *
   * Inserts the entity and registers a compensation that deletes its row.
   *
   * @returns The id returned by the catalog insert.
   */
  async createEntityWithRollback(
    entity: BackstageEntity
  ): Promise<string> {
    const entityId = await catalog.insertEntity(entity);

    this.registerAction({
      description: `Delete entity ${entity.metadata.name}`,
      rollback: async () => {
        await db('entities').where('id', entityId).delete();
      },
    });

    return entityId;
  }

  /**
   * Example: Webhook creation with rollback.
   *
   * Creates the TFC notification and registers a compensation that deletes it.
   *
   * @returns The id of the created notification.
   */
  async createWebhookWithRollback(
    workspaceId: string
  ): Promise<string> {
    const webhookId = await tfc.createNotification({
      workspaceId,
      name: 'backstage-sync',
      destinationType: 'generic',
      url: 'https://backstage.example.com/api/sync/tfc-webhook',
      triggers: ['run:completed'],
    });

    this.registerAction({
      description: `Delete TFC webhook ${webhookId}`,
      rollback: async () => {
        await tfc.deleteNotification(workspaceId, webhookId);
      },
    });

    return webhookId;
  }
}
3. Cleanup on Failure
// src/onboarding/rollback/cleanup-handler.ts
/**
 * Removes what a failed onboarding left behind and marks it as cleaned up.
 */
export class CleanupHandler {
  /**
   * Clean up partial onboarding on failure.
   *
   * Entities are deleted when the failure happened in REGISTERING or a later
   * state; earlier states have nothing to remove. The history row is then
   * marked `failed_cleaned_up` in every case.
   */
  async cleanup(onboardingId: string, failedState: OnboardingState): Promise<void> {
    console.log(`Cleaning up failed onboarding ${onboardingId} from state ${failedState}`);

    const context = await this.loadContext(onboardingId);

    switch (failedState) {
      case 'REGISTERING':
      case 'CONFIGURING_SYNC':
      case 'NOTIFYING':
        // The context may not list any entities yet (for example when
        // REGISTERING failed before the first insert).
        await this.deleteEntities(context?.entityIds ?? []);
        break;

      case 'GENERATING':
        // No cleanup needed (entities not created yet)
        break;

      default:
        // No cleanup needed for early states
        break;
    }

    await db('onboarding_history')
      .where('id', onboardingId)
      .update({
        status: 'failed_cleaned_up',
        cleaned_up_at: new Date(),
      });
  }

  /**
   * Delete the given entities together with every relationship that has one
   * of them as source or target, in one transaction.
   */
  private async deleteEntities(entityIds: string[]): Promise<void> {
    if (entityIds.length === 0) {
      // Nothing to delete; skip opening a transaction.
      return;
    }

    await db.transaction(async (trx) => {
      // Delete relationships first
      await trx('entity_relationships')
        .whereIn('source_id', entityIds)
        .orWhereIn('target_id', entityIds)
        .delete();

      await trx('entities')
        .whereIn('id', entityIds)
        .delete();
    });

    console.log(`Deleted ${entityIds.length} entities`);
  }
}
Idempotent Operations
1. Idempotent Entity Creation
// src/onboarding/idempotency/entity-creation.ts
/**
 * Entity writes that can be repeated without producing duplicate rows.
 *
 * NOTE(review): both methods look the entity up and then insert it in
 * separate statements, so two concurrent callers can still both insert.
 * Closing that gap needs a unique constraint on the identity columns.
 */
export class IdempotentEntityCreation {
  /**
   * Create entity if not exists.
   *
   * @returns The id of the existing row when kind, name and namespace match,
   *   otherwise the id of the newly inserted row.
   */
  async createOrGet(entity: BackstageEntity): Promise<string> {
    const existing = await this.findByIdentity(entity);

    if (existing) {
      console.log(`Entity ${entity.metadata.name} already exists, returning existing ID`);
      return existing.id;
    }

    const id = await this.insertEntity(entity);
    console.log(`Created entity ${entity.metadata.name} with ID ${id}`);
    return id;
  }

  /**
   * Update entity if exists, create if not.
   *
   * An existing row gets its `metadata`, `spec` and `updated_at` replaced.
   *
   * @returns The id of the updated or newly inserted row.
   */
  async upsert(entity: BackstageEntity): Promise<string> {
    const existing = await this.findByIdentity(entity);

    if (existing) {
      await db('entities')
        .where('id', existing.id)
        .update({
          metadata: JSON.stringify(entity.metadata),
          spec: JSON.stringify(entity.spec),
          updated_at: new Date(),
        });

      console.log(`Updated entity ${entity.metadata.name}`);
      return existing.id;
    }

    const id = await this.insertEntity(entity);
    console.log(`Created entity ${entity.metadata.name}`);
    return id;
  }

  /** Find the row whose kind, name and namespace match the entity. */
  private async findByIdentity(
    entity: BackstageEntity
  ): Promise<{ id: string } | undefined> {
    return await db('entities')
      .where({
        kind: entity.kind,
        'metadata->name': entity.metadata.name,
        'metadata->namespace': entity.metadata.namespace,
      })
      .first();
  }

  /** Insert the entity and return the id of the new row. */
  private async insertEntity(entity: BackstageEntity): Promise<string> {
    const [row] = await db('entities').insert({
      api_version: entity.apiVersion,
      kind: entity.kind,
      metadata: JSON.stringify(entity.metadata),
      spec: JSON.stringify(entity.spec),
      // `annotations` may be absent; do not crash on it.
      tenant_id: entity.metadata.annotations?.['backstage.io/tenant-id'],
    }).returning('id');

    // Knex 1.0+ yields `{ id }` objects from returning(); earlier versions
    // yield the bare id. Accept both so callers always get the id itself.
    if (typeof row === 'object' && row !== null && 'id' in row) {
      return (row as { id: string }).id;
    }
    return row as string;
  }
}
2. Idempotent Webhook Creation
// src/onboarding/idempotency/webhook-creation.ts
/**
 * Webhook registration that reuses an existing hook instead of creating a
 * duplicate.
 */
export class IdempotentWebhookCreation {
  /**
   * Create webhook if not exists.
   *
   * An existing notification counts as the same webhook when both its name
   * and its URL match `config`.
   *
   * @returns The id of the matching or newly created notification.
   */
  async createOrGetTfcWebhook(
    workspaceId: string,
    config: WebhookConfig
  ): Promise<string> {
    const existingWebhooks = await tfc.listNotifications(workspaceId);
    const existing = existingWebhooks.find(w =>
      w.name === config.name &&
      w.url === config.url
    );

    if (existing) {
      console.log(`Webhook ${config.name} already exists for workspace ${workspaceId}`);
      return existing.id;
    }

    // `workspaceId` goes last so a stray `workspaceId` inside `config` cannot
    // redirect the creation away from the workspace that was just checked.
    const webhookId = await tfc.createNotification({
      ...config,
      workspaceId,
    });

    console.log(`Created webhook ${config.name} for workspace ${workspaceId}`);
    return webhookId;
  }

  /**
   * Create GitHub webhook if not exists.
   *
   * An existing hook counts as the same webhook when its configured URL
   * matches `config.url`.
   *
   * @returns The id of the matching or newly created hook.
   */
  async createOrGetGitHubWebhook(
    owner: string,
    repo: string,
    config: GitHubWebhookConfig
  ): Promise<number> {
    // The listing is paginated and defaults to 30 hooks per page; request the
    // maximum page size so an existing hook beyond the first 30 is not missed
    // and created a second time.
    const { data: webhooks } = await github.repos.listWebhooks({
      owner,
      repo,
      per_page: 100,
    });
    const existing = webhooks.find(w =>
      w.config?.url === config.url
    );

    if (existing) {
      console.log(`GitHub webhook already exists for ${owner}/${repo}`);
      return existing.id;
    }

    const { data: webhook } = await github.repos.createWebhook({
      owner,
      repo,
      config: {
        url: config.url,
        content_type: 'json',
        secret: config.secret,
      },
      events: config.events,
    });

    console.log(`Created GitHub webhook for ${owner}/${repo}`);
    return webhook.id;
  }
}
Testing Idempotency
// tests/integration/idempotency.test.ts
/**
 * Integration tests for the idempotency guarantees: repeated triggers,
 * resuming after a failure, and duplicate-free entity creation.
 */
describe('Onboarding Idempotency', () => {
  it('should return same result when triggered twice', async () => {
    const context = {
      tenantId: 'tenant-123',
      repoUrl: 'https://github.com/acme-corp/bu-finance-infrastructure',
      workspaceId: 'ws-abc123',
    };

    // First onboarding
    const result1 = await onboardingService.trigger(context);
    expect(result1.status).toBe('completed');

    // Second onboarding (should be idempotent)
    const result2 = await onboardingService.trigger(context);
    expect(result2.status).toBe('already_completed');
    expect(result2.onboardingId).toBe(result1.onboardingId);
    expect(result2.entities).toEqual(result1.entities);
  });

  it('should resume from failure', async () => {
    // Start onboarding
    const context = { /* ... */ };
    // trigger() resolves to a result object (see the test above), so the id
    // has to be taken from it rather than using the whole result as the id.
    const { onboardingId } = await onboardingService.trigger(context);

    // Simulate failure at GENERATING state
    await simulateFailure(onboardingId, 'GENERATING');

    // Resume (should continue from GENERATING)
    const result = await onboardingService.resume(onboardingId);
    expect(result.status).toBe('completed');
  });

  it('should not create duplicate entities', async () => {
    const entity = createTestEntity();

    // Create entity twice
    const id1 = await entityCreator.createOrGet(entity);
    const id2 = await entityCreator.createOrGet(entity);

    // Should return same ID
    expect(id1).toBe(id2);

    // Verify only one entity in database
    const count = await db('entities')
      .where('metadata->name', entity.metadata.name)
      .count();
    // Drivers differ on whether COUNT comes back as a string or a number,
    // so compare numerically.
    expect(Number(count[0].count)).toBe(1);
  });
});
Summary
Idempotency Guarantees
- ✅ Safe to retry any operation
- ✅ No duplicate entities created
- ✅ No duplicate webhooks created
- ✅ Partial progress preserved
- ✅ Automatic rollback on failure
- ✅ Consistent state after retry
Retry Strategy
| State | Max Attempts | Base Delay | Max Delay | Rollback |
|---|---|---|---|---|
| RECEIVED | 1 | - | - | No |
| VALIDATING | 1 | - | - | No |
| DISCOVERING | 3 | 2s | 30s | No |
| EXTRACTING | 3 | 1s | 15s | No |
| GENERATING | 3 | 1s | 10s | No |
| REGISTERING | 5 | 2s | 60s | Yes |
| CONFIGURING_SYNC | 3 | 2s | 30s | Yes |
| NOTIFYING | 2 | 1s | 5s | Yes |
| ROLLING_BACK | 5 | 3s | 60s | N/A |
Next Steps
See 08-synchronization-setup.md for ongoing sync configuration.