PAA: English Pronunciation Assessment Agent
Evaluate spoken English intelligibility, stress, fluency, and grammatical correctness using AI. Please speak freely.
Analysis Result
Transcript
Two-Syllable Word Stress Details
| Word | Word Index | Part of Speech | Expected Stress | Inferred Stress | Syllable 1 Duration | Syllable 2 Duration | Duration Ratio | Score | Confidence | Feedback |
|---|---|---|---|---|---|---|---|---|---|---|
Timing
Developers: A2A usage
- Agent Card URL: /.well-known/agent.json
- A2A JSON-RPC endpoint: /a2a (methods: agent.about, pronunciation.evaluate)
- Health endpoint: /api/healthz
Example request: pronunciation.evaluate
This service provides an A2A-compatible JSON-RPC endpoint for remote agents to evaluate freeform spoken English across four constructs (Pronunciation, Stress, Fluency, Grammar).
Optional parameter: expected_text (string). When provided, the backend computes constrained speech match metrics (including Levenshtein-based comparison) against the expected text.
{
"jsonrpc": "2.0",
"id": "1",
"method": "pronunciation.evaluate",
"params": {
"audio_wav_base64": "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=",
"deepgram_api_key": "optional_override_key",
"gemini_api_key": "optional_override_key",
"expected_text": "optional reference transcript used for constrained speech match"
}
}
Example response
This sample shows the structure you receive after a successful API call. Use this shape to map results into UI components and interpret success.
{
"jsonrpc": "2.0",
"id": "1",
"result": {
"audio": {
"bytes": 190764,
"channels": 1,
"content_type": "audio/wav",
"path": "https://storage.googleapis.com/syllable-stress/260501194834393567p.wav",
"sample_rate_hz": 16000,
"sample_width_bytes": 2
},
"constrained_alignment": [
{
"status": "missing",
"word": "plain",
"word_index": 13
},
{
"status": "unexpected",
"word": "plane",
"word_index": 13
}
],
"created_at_hst": "2026-05-01T19:48:34.393567-10:00",
"deepgram_words": [
{
"confidence": 0.99658203,
"confidence_cubed": 0.98977935,
"end": 0.51,
"index": 1,
"start": 0.05,
"word": "Please"
},
{
"confidence": 1.0,
"confidence_cubed": 1.0,
"end": 1.13,
"index": 2,
"start": 0.51,
"word": "record"
},
{
"confidence": 0.9941406,
"confidence_cubed": 0.9825227,
"end": 1.29,
"index": 3,
"start": 1.13,
"word": "the"
},
{
"confidence": 0.9995117,
"confidence_cubed": 0.9985358,
"end": 1.77,
"index": 4,
"start": 1.29,
"word": "record"
}
],
"expected_text": "Please record the record. The rain in Spain falls mainly on the plain.",
"grammar-errors-list": [
"Use 'plain' instead of 'plane' in this context."
],
"persistence": {
"json_path": "https://storage.googleapis.com/syllable-stress/260501194834393567p.json",
"wav_path": "https://storage.googleapis.com/syllable-stress/260501194834393567p.wav"
},
"pipeline": {
"aligner": "pocketsphinx",
"asr_model": "nova-2",
"asr_provider": "deepgram",
"gemini_model_name": "gemini-2.5-flash-lite"
},
"recording_id": "260501194834393567p",
"render_words": [
{
"text": "Please",
"word_confidence_cubed": 0.99
},
{
"text": "record",
"word_confidence_cubed": 1.0,
"is_target": true,
"target_status": "unaligned",
"target_correct": false
},
{
"text": "the",
"word_confidence_cubed": 0.98
},
{
"text": "record.",
"word_confidence_cubed": 0.99,
"is_target": true,
"target_status": null,
"target_correct": false
}
],
"request_id": "9951c1b6-727a-4c74-9faa-cbd0c7569d4b",
"request_ip": "2603:800c:1200:596a:22cf:c2de:e37a:2bb",
"schema_version": 1,
"scores": {
"constrained_match": 0.9231,
"fluency": 10.5,
"fluency_confidence_range": 1.1292,
"language_use": 1,
"pronunciation": 0.9598,
"stress": 0.4406,
"words_per_minute": 130.9
},
"source": "web_ui",
"targets": [
{
"core_durations": {
"syll1": 0.05,
"syll2": 0.08
},
"core_phones": {
"syll1": "IH",
"syll2": "AO"
},
"correct": 0.4363,
"deepgram_confidence": 1,
"deepgram_confidence_cubed": 1,
"deepgram_word_index": 1,
"duration_ratio": 0.625,
"duration_ratio_log": -0.469255,
"expected_stress": 2,
"feedback": "Incorrect stress. (expected syllable 2)",
"inferred_stress": 1,
"token_index": 1,
"word_display": "record",
"word_norm": "record"
},
{
"core_durations": {
"syll1": 0.05,
"syll2": 0.11
},
"core_phones": {
"syll1": "EH",
"syll2": "ER"
},
"correct": 0.445,
"deepgram_confidence": 0.9995117,
"deepgram_confidence_cubed": 0.9985,
"deepgram_word_index": 3,
"duration_ratio": 0.454545,
"duration_ratio_log": -0.787368,
"expected_stress": 1,
"feedback": "Incorrect stress. (expected syllable 1)",
"inferred_stress": 2,
"token_index": 3,
"word_display": "record",
"word_norm": "record"
}
],
"timezone": "Pacific/Honolulu",
"timing": {
"bucket_json_read_process_sec": 0.108,
"deepgram_api_sec": 0.477,
"grammar_llm_sec": 2.031,
"levenshtein_distance_calc_sec": 0,
"persist_output_files_sec": 0.004,
"pocketsphinx_alignment_sec": 0.391,
"pos_tagging_llm_sec": 1.831,
"recording_duration_sec": 5.96,
"vocab_frequency_analysis_sec": 0
},
"transcript": "Please record the record. The rain in Spain falls mainly on the plane."
}
}
JSON results schema
This is the formal JSON schema defining the structure of the API response.
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "Pronunciation Evaluation Response",
"description": "Formal schema for the JSON-RPC response returned by the /a2a endpoint for pronunciation evaluation.",
"type": "object",
"required": [
"jsonrpc",
"id"
],
"oneOf": [
{
"required": [
"result"
]
},
{
"required": [
"error"
]
}
],
"properties": {
"jsonrpc": {
"type": "string",
"const": "2.0",
"description": "JSON-RPC protocol version"
},
"id": {
"type": [
"string",
"number",
"null"
],
"description": "JSON-RPC request identifier"
},
"error": {
"type": "object",
"description": "JSON-RPC error object containing details of a failure.",
"required": [
"code",
"message"
],
"properties": {
"code": {
"type": "integer",
"description": "A Number that indicates the error type that occurred."
},
"message": {
"type": "string",
"description": "A String providing a short description of the error."
},
"request_id": {
"type": [
"string",
"null"
],
"description": "Unique identifier for the API request."
},
"data": {
"type": "object",
"description": "Additional error telemetry and metadata, typically containing the persistence sidecar structure without the analysis metrics.",
"required": [
"persistence",
"schema_version",
"recording_id",
"created_at_hst",
"timezone",
"source",
"audio",
"pipeline"
]
}
}
},
"result": {
"type": "object",
"description": "The result of the speech evaluation containing metrics, alignments, and metadata.",
"required": [
"request_id",
"scores",
"persistence",
"timing",
"transcript",
"schema_version",
"recording_id",
"created_at_hst",
"timezone",
"source",
"audio",
"pipeline"
],
"properties": {
"audio": {
"type": "object",
"description": "Metadata for the persisted audio file.",
"properties": {
"path": {
"type": "string"
},
"bytes": {
"type": "integer"
},
"content_type": {
"type": "string"
},
"sample_rate_hz": {
"type": "integer"
},
"channels": {
"type": "integer"
},
"sample_width_bytes": {
"type": "integer"
}
}
},
"constrained_alignment": {
"type": "array",
"description": "List of mismatched words when expected_text is provided, detailing missing or unexpected entries.",
"items": {
"type": "object",
"properties": {
"status": {
"type": "string",
"enum": [
"missing",
"unexpected"
]
},
"word": {
"type": "string"
},
"word_index": {
"type": "integer",
"description": "1-based index position in expected text or transcript."
}
}
}
},
"created_at_hst": {
"type": "string",
"description": "ISO timestamp of recording in HST timezone."
},
"deepgram_words": {
"type": "array",
"description": "Raw metadata for every word transcribed by Deepgram ASR.",
"items": {
"type": "object",
"properties": {
"index": {
"type": "integer"
},
"word": {
"type": "string"
},
"start": {
"type": "number"
},
"end": {
"type": "number"
},
"confidence": {
"type": "number"
},
"confidence_cubed": {
"type": [
"number",
"null"
]
}
}
}
},
"expected_text": {
"type": [
"string",
"null"
],
"description": "The expected transcript provided in the request for constrained matching."
},
"grammar-errors-list": {
"type": "array",
"description": "List of up to 3 descriptions of grammatical errors found in the transcript.",
"items": {
"type": "string"
}
},
"persistence": {
"type": "object",
"description": "Public URLs to the saved payload and audio files.",
"properties": {
"json_path": {
"type": "string"
},
"wav_path": {
"type": "string"
}
}
},
"pipeline": {
"type": "object",
"description": "Static pipeline configurations.",
"properties": {
"aligner": {
"type": "string"
},
"asr_model": {
"type": "string"
},
"asr_provider": {
"type": "string"
},
"gemini_model_name": {
"type": "string"
}
}
},
"recording_id": {
"type": "string",
"description": "Primary key string."
},
"render_words": {
"type": "array",
"description": "List of recognized words with styling metadata for frontend rendering.",
"items": {
"type": "object",
"properties": {
"text": {
"type": "string"
},
"word_confidence_cubed": {
"type": [
"number",
"null"
]
},
"is_target": {
"type": "boolean"
},
"target_status": {
"type": [
"string",
"null"
]
},
"target_correct": {
"type": [
"boolean",
"null"
]
}
}
}
},
"request_id": {
"type": "string",
"description": "Unique identifier for the API request."
},
"request_ip": {
"type": [
"string",
"null"
],
"description": "User IP."
},
"schema_version": {
"type": "integer",
"description": "Data format revision."
},
"scores": {
"type": "object",
"description": "High-level aggregate metrics evaluating the speech.",
"properties": {
"constrained_match": {
"type": [
"number",
"null"
],
"description": "0.0 to 1.0 score indicating how well the spoken audio matches the expected_text using Levenshtein distance."
},
"fluency": {
"type": [
"number",
"null"
],
"description": "Estimated reading grade level (3.0-10.0) based on the rank of recognized words."
},
"fluency_confidence_range": {
"type": [
"number",
"null"
],
"description": "The +/- 50% confidence interval margin for the fluency grade prediction."
},
"language_use": {
"type": [
"number",
"null"
],
"description": "0.0 to 1.0 grammatical correctness score evaluated by an LLM."
},
"pronunciation": {
"type": [
"number",
"null"
],
"description": "0.0 to 1.0 score based on the geometric mean of the cubed ASR confidence values."
},
"stress": {
"type": [
"number",
"null"
],
"description": "0.0 to 1.0 aggregate score indicating the correctness of syllable stress on multi-syllable target words."
},
"words_per_minute": {
"type": [
"number",
"null"
],
"description": "Speaking rate measured in words per minute."
}
}
},
"source": {
"type": "string",
"description": "Origin of the request, e.g. web_ui or a2a."
},
"targets": {
"type": "array",
"description": "Detailed syllable stress evaluation for target multi-syllable words.",
"items": {
"type": "object",
"properties": {
"core_durations": {
"type": "object",
"additionalProperties": {
"type": "number"
},
"description": "Duration in seconds of core vowel phonemes."
},
"core_phones": {
"type": "object",
"additionalProperties": {
"type": "string"
},
"description": "Phonetic representation of core vowel phonemes."
},
"correct": {
"type": "number",
"description": "Continuous score (0.0 to 1.0) indicating stress correctness."
},
"decision_confidence": {
"type": [
"number",
"null"
],
"description": "Percentage confidence of the stress decision."
},
"deepgram_confidence": {
"type": "number"
},
"deepgram_confidence_cubed": {
"type": "number"
},
"deepgram_word_index": {
"type": "integer"
},
"duration_ratio": {
"type": "number"
},
"duration_ratio_log": {
"type": "number"
},
"expected_stress": {
"type": "integer",
"description": "Expected stressed syllable index (e.g. 1 for Noun, 2 for Verb)."
},
"feedback": {
"type": "string"
},
"inferred_stress": {
"type": [
"integer",
"null"
]
},
"token_index": {
"type": "integer"
},
"word_display": {
"type": "string"
},
"word_norm": {
"type": "string"
},
"status": {
"type": "string",
"enum": [
"unaligned",
"error"
],
"description": "Present only if alignment failed."
}
}
}
},
"timezone": {
"type": "string",
"description": "Declared timezone."
},
"timing": {
"type": "object",
"description": "Execution durations in seconds for various pipeline stages.",
"properties": {
"recording_duration_sec": {
"type": "number"
},
"deepgram_api_sec": {
"type": "number"
},
"vocab_frequency_analysis_sec": {
"type": "number"
},
"grammar_llm_sec": {
"type": [
"number",
"null"
]
},
"persist_output_files_sec": {
"type": "number"
},
"pos_tagging_llm_sec": {
"type": [
"number",
"null"
]
},
"bucket_json_read_process_sec": {
"type": [
"number",
"null"
]
},
"pocketsphinx_alignment_sec": {
"type": [
"number",
"null"
]
},
"levenshtein_distance_calc_sec": {
"type": [
"number",
"null"
]
}
}
},
"transcript": {
"type": [
"string",
"null"
],
"description": "The fully punctuated transcript returned by the ASR system."
}
}
}
}
}
Syllable Stress Words
The 69 words are: abstract, accent, address, affect, ally, circuit, combat, combine, compact, compound, concert, concrete, conduct, conflict, console, content, contest, contract, contrast, convert, costume, debut, decrease, desert, detail, discount, effect, escort, essay, export, extract, finance, impact, import, increase, insert, invite, object, perfect, permit, premiere, present, produce, progress, project, prospect, protest, rebel, recall, record, reform, refuse, reject, rescue, research, retreat, romance, segment, sentence, shanghai, subject, survey, suspect, transfer, transport, update, upgrade, upset, weekend.
How to call this agent from Strapi
The following example demonstrates how to call this agent from a Strapi back-end system using fetch().
module.exports = {
async evaluatePronunciation(ctx) {
// Assuming you receive the base64 audio in the request body
const { audioBase64, expectedText } = ctx.request.body;
// Define the JSON-RPC payload
const payload = {
jsonrpc: "2.0",
id: Date.now().toString(),
method: "pronunciation.evaluate",
params: {
audio_wav_base64: audioBase64,
expected_text: expectedText
}
};
try {
// Call the agent's A2A endpoint
// Replace with your actual deployed agent URL
const response = await fetch('https://paa.talknicer.com/a2a', {
method: 'POST',
headers: {
'Content-Type': 'application/json'
},
body: JSON.stringify(payload)
});
if (!response.ok) {
throw new Error(`Agent error: ${response.status}`);
}
const data = await response.json();
return ctx.send(data);
} catch (err) {
return ctx.internalServerError('Failed to evaluate speech', { error: err.message });
}
}
};
Bring Your Own Deepgram API Key
The shared Deepgram API key used by this demo will probably not last forever. To ensure uninterrupted access, you can provide your own — Deepgram offers $200 in free credits with no credit card required. Sign up for Deepgram and generate an API key.
Using shared API key
Bring Your Own Gemini API Key
The shared Gemini API key used by this demo will probably not last forever. To ensure uninterrupted access, you can provide your own — Google offers free credits. Generate a Gemini API key.
Using shared API key