PAA: English Pronunciation Assessment Agent

Evaluate spoken English intelligibility, stress, fluency, and grammatical correctness using AI. Please speak freely.

Speak for up to two minutes:

Not recording.

Expected Text (optional; providing it enables Constrained Speech mode):

Analysis Result

Words Per Minute

Pronunciation

Stress

Spoken Fluency

Language Use in Context

Transcript

Two Syllable Word Stress Details

Word	Word Index	Part of Speech	Expected Stress	Inferred Stress	Syllable 1 Duration	Syllable 2 Duration	Duration Ratio	Score	Confidence	Feedback

Timing

Raw Analysis JSON

Developers: A2A usage

Agent Card URL: /.well-known/agent.json
A2A JSON-RPC endpoint: /a2a (methods: agent.about, pronunciation.evaluate)
Health endpoint: /api/healthz

Example request: pronunciation.evaluate

This service provides an A2A-compatible JSON-RPC endpoint for remote agents to evaluate freeform spoken English across four constructs (Pronunciation, Stress, Fluency, Grammar).

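Optional parameters: expected_text (string) — when provided, the backend computes constrained speech match metrics (including Levenshtein-based comparison) against the expected text; deepgram_api_key and gemini_api_key (strings) — when provided, they are used in place of the shared demo API keys.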

{
  "jsonrpc": "2.0",
  "id": "1",
  "method": "pronunciation.evaluate",
  "params": {
    "audio_wav_base64": "UklGRiQAAABXQVZFZm10IBAAAAABAAEAQB8AAEAfAAABAAgAZGF0YQAAAAA=",
    "deepgram_api_key": "optional_override_key",
    "gemini_api_key": "optional_override_key",
    "expected_text": "optional reference transcript used for constrained speech match"
  }
}

Example response

This sample shows the structure of the response returned by a successful API call. Use this shape to map result fields into UI components. A successful response carries a "result" object; a failed call carries an "error" object instead.

{
  "jsonrpc": "2.0",
  "id": "1",
  "result": {
    "audio": {
      "bytes": 190764,
      "channels": 1,
      "content_type": "audio/wav",
      "path": "https://storage.googleapis.com/syllable-stress/260501194834393567p.wav",
      "sample_rate_hz": 16000,
      "sample_width_bytes": 2
    },
    "constrained_alignment": [
      {
        "status": "missing",
        "word": "plain",
        "word_index": 13
      },
      {
        "status": "unexpected",
        "word": "plane",
        "word_index": 13
      }
    ],
    "created_at_hst": "2026-05-01T19:48:34.393567-10:00",
    "deepgram_words": [
      {
        "confidence": 0.99658203,
        "confidence_cubed": 0.98977935,
        "end": 0.51,
        "index": 1,
        "start": 0.05,
        "word": "Please"
      },
      {
        "confidence": 1.0,
        "confidence_cubed": 1.0,
        "end": 1.13,
        "index": 2,
        "start": 0.51,
        "word": "record"
      },
      {
        "confidence": 0.9941406,
        "confidence_cubed": 0.9825227,
        "end": 1.29,
        "index": 3,
        "start": 1.13,
        "word": "the"
      },
      {
        "confidence": 0.9995117,
        "confidence_cubed": 0.9985358,
        "end": 1.77,
        "index": 4,
        "start": 1.29,
        "word": "record"
      }
    ],
    "expected_text": "Please record the record. The rain in Spain falls mainly on the plain.",
    "grammar-errors-list": [
      "Use 'plain' instead of 'plane' in this context."
    ],
    "persistence": {
      "json_path": "https://storage.googleapis.com/syllable-stress/260501194834393567p.json",
      "wav_path": "https://storage.googleapis.com/syllable-stress/260501194834393567p.wav"
    },
    "pipeline": {
      "aligner": "pocketsphinx",
      "asr_model": "nova-2",
      "asr_provider": "deepgram",
      "gemini_model_name": "gemini-2.5-flash-lite"
    },
    "recording_id": "260501194834393567p",
    "render_words": [
      {
        "text": "Please",
        "word_confidence_cubed": 0.99
      },
      {
        "text": "record",
        "word_confidence_cubed": 1.0,
        "is_target": true,
        "target_status": "unaligned",
        "target_correct": false
      },
      {
        "text": "the",
        "word_confidence_cubed": 0.98
      },
      {
        "text": "record.",
        "word_confidence_cubed": 0.99,
        "is_target": true,
        "target_status": null,
        "target_correct": false
      }
    ],
    "request_id": "9951c1b6-727a-4c74-9faa-cbd0c7569d4b",
    "request_ip": "2603:800c:1200:596a:22cf:c2de:e37a:2bb",
    "schema_version": 1,
    "scores": {
      "constrained_match": 0.9231,
      "fluency": 10.5,
      "fluency_confidence_range": 1.1292,
      "language_use": 1,
      "pronunciation": 0.9598,
      "stress": 0.4406,
      "words_per_minute": 130.9
    },
    "source": "web_ui",
    "targets": [
      {
        "core_durations": {
          "syll1": 0.05,
          "syll2": 0.08
        },
        "core_phones": {
          "syll1": "IH",
          "syll2": "AO"
        },
        "correct": 0.4363,
        "deepgram_confidence": 1,
        "deepgram_confidence_cubed": 1,
        "deepgram_word_index": 1,
        "duration_ratio": 0.625,
        "duration_ratio_log": -0.469255,
        "expected_stress": 2,
        "feedback": "Incorrect stress. (expected syllable 2)",
        "inferred_stress": 1,
        "token_index": 1,
        "word_display": "record",
        "word_norm": "record"
      },
      {
        "core_durations": {
          "syll1": 0.05,
          "syll2": 0.11
        },
        "core_phones": {
          "syll1": "EH",
          "syll2": "ER"
        },
        "correct": 0.445,
        "deepgram_confidence": 0.9995117,
        "deepgram_confidence_cubed": 0.9985,
        "deepgram_word_index": 3,
        "duration_ratio": 0.454545,
        "duration_ratio_log": -0.787368,
        "expected_stress": 1,
        "feedback": "Incorrect stress. (expected syllable 1)",
        "inferred_stress": 2,
        "token_index": 3,
        "word_display": "record",
        "word_norm": "record"
      }
    ],
    "timezone": "Pacific/Honolulu",
    "timing": {
      "bucket_json_read_process_sec": 0.108,
      "deepgram_api_sec": 0.477,
      "grammar_llm_sec": 2.031,
      "levenshtein_distance_calc_sec": 0,
      "persist_output_files_sec": 0.004,
      "pocketsphinx_alignment_sec": 0.391,
      "pos_tagging_llm_sec": 1.831,
      "recording_duration_sec": 5.96,
      "vocab_frequency_analysis_sec": 0
    },
    "transcript": "Please record the record. The rain in Spain falls mainly on the plane."
  }
}

JSON results schema

This is the formal JSON schema defining the structure of the API response.

{
  "$schema": "http://json-schema.org/draft-07/schema#",
  "title": "Pronunciation Evaluation Response",
  "description": "Formal schema for the JSON-RPC response returned by the /a2a endpoint for pronunciation evaluation.",
  "type": "object",
  "required": [
    "jsonrpc",
    "id"
  ],
  "oneOf": [
    {
      "required": [
        "result"
      ]
    },
    {
      "required": [
        "error"
      ]
    }
  ],
  "properties": {
    "jsonrpc": {
      "type": "string",
      "const": "2.0",
      "description": "JSON-RPC protocol version"
    },
    "id": {
      "type": [
        "string",
        "number",
        "null"
      ],
      "description": "JSON-RPC request identifier"
    },
    "error": {
      "type": "object",
      "description": "JSON-RPC error object containing details of a failure.",
      "required": [
        "code",
        "message"
      ],
      "properties": {
        "code": {
          "type": "integer",
          "description": "A Number that indicates the error type that occurred."
        },
        "message": {
          "type": "string",
          "description": "A String providing a short description of the error."
        },
        "request_id": {
          "type": [
            "string",
            "null"
          ],
          "description": "Unique identifier for the API request."
        },
        "data": {
          "type": "object",
          "description": "Additional error telemetry and metadata, typically containing the persistence sidecar structure without the analysis metrics.",
          "required": [
            "persistence",
            "schema_version",
            "recording_id",
            "created_at_hst",
            "timezone",
            "source",
            "audio",
            "pipeline"
          ]
        }
      }
    },
    "result": {
      "type": "object",
      "description": "The result of the speech evaluation containing metrics, alignments, and metadata.",
      "required": [
        "request_id",
        "scores",
        "persistence",
        "timing",
        "transcript",
        "schema_version",
        "recording_id",
        "created_at_hst",
        "timezone",
        "source",
        "audio",
        "pipeline"
      ],
      "properties": {
        "audio": {
          "type": "object",
          "description": "Metadata for the persisted audio file.",
          "properties": {
            "path": {
              "type": "string"
            },
            "bytes": {
              "type": "integer"
            },
            "content_type": {
              "type": "string"
            },
            "sample_rate_hz": {
              "type": "integer"
            },
            "channels": {
              "type": "integer"
            },
            "sample_width_bytes": {
              "type": "integer"
            }
          }
        },
        "constrained_alignment": {
          "type": "array",
          "description": "List of mismatched words when expected_text is provided, detailing missing or unexpected entries.",
          "items": {
            "type": "object",
            "properties": {
              "status": {
                "type": "string",
                "enum": [
                  "missing",
                  "unexpected"
                ]
              },
              "word": {
                "type": "string"
              },
              "word_index": {
                "type": "integer",
                "description": "1-based index position in expected text or transcript."
              }
            }
          }
        },
        "created_at_hst": {
          "type": "string",
          "description": "ISO timestamp of recording in HST timezone."
        },
        "deepgram_words": {
          "type": "array",
          "description": "Raw metadata for every word transcribed by Deepgram ASR.",
          "items": {
            "type": "object",
            "properties": {
              "index": {
                "type": "integer"
              },
              "word": {
                "type": "string"
              },
              "start": {
                "type": "number"
              },
              "end": {
                "type": "number"
              },
              "confidence": {
                "type": "number"
              },
              "confidence_cubed": {
                "type": [
                  "number",
                  "null"
                ]
              }
            }
          }
        },
        "expected_text": {
          "type": [
            "string",
            "null"
          ],
          "description": "The expected transcript provided in the request for constrained matching."
        },
        "grammar-errors-list": {
          "type": "array",
          "description": "List of up to 3 descriptions of grammatical errors found in the transcript.",
          "items": {
            "type": "string"
          }
        },
        "persistence": {
          "type": "object",
          "description": "Public URLs to the saved payload and audio files.",
          "properties": {
            "json_path": {
              "type": "string"
            },
            "wav_path": {
              "type": "string"
            }
          }
        },
        "pipeline": {
          "type": "object",
          "description": "Static pipeline configurations.",
          "properties": {
            "aligner": {
              "type": "string"
            },
            "asr_model": {
              "type": "string"
            },
            "asr_provider": {
              "type": "string"
            },
            "gemini_model_name": {
              "type": "string"
            }
          }
        },
        "recording_id": {
          "type": "string",
          "description": "Primary key string."
        },
        "render_words": {
          "type": "array",
          "description": "List of recognized words with styling metadata for frontend rendering.",
          "items": {
            "type": "object",
            "properties": {
              "text": {
                "type": "string"
              },
              "word_confidence_cubed": {
                "type": [
                  "number",
                  "null"
                ]
              },
              "is_target": {
                "type": "boolean"
              },
              "target_status": {
                "type": [
                  "string",
                  "null"
                ]
              },
              "target_correct": {
                "type": [
                  "boolean",
                  "null"
                ]
              }
            }
          }
        },
        "request_id": {
          "type": "string",
          "description": "Unique identifier for the API request."
        },
        "request_ip": {
          "type": [
            "string",
            "null"
          ],
          "description": "User IP."
        },
        "schema_version": {
          "type": "integer",
          "description": "Data format revision."
        },
        "scores": {
          "type": "object",
          "description": "High-level aggregate metrics evaluating the speech.",
          "properties": {
            "constrained_match": {
              "type": [
                "number",
                "null"
              ],
              "description": "0.0 to 1.0 score indicating how well the spoken audio matches the expected_text using Levenshtein distance."
            },
            "fluency": {
              "type": [
                "number",
                "null"
              ],
              "description": "Estimated reading grade level (3.0-10.0) based on the rank of recognized words."
            },
            "fluency_confidence_range": {
              "type": [
                "number",
                "null"
              ],
              "description": "The +/- 50% confidence interval margin for the fluency grade prediction."
            },
            "language_use": {
              "type": [
                "number",
                "null"
              ],
              "description": "0.0 to 1.0 grammatical correctness score evaluated by an LLM."
            },
            "pronunciation": {
              "type": [
                "number",
                "null"
              ],
              "description": "0.0 to 1.0 score based on the geometric mean of the cubed ASR confidence values."
            },
            "stress": {
              "type": [
                "number",
                "null"
              ],
              "description": "0.0 to 1.0 aggregate score indicating the correctness of syllable stress on multi-syllable target words."
            },
            "words_per_minute": {
              "type": [
                "number",
                "null"
              ],
              "description": "Speaking rate measured in words per minute."
            }
          }
        },
        "source": {
          "type": "string",
          "description": "Origin of the request, e.g. web_ui or a2a."
        },
        "targets": {
          "type": "array",
          "description": "Detailed syllable stress evaluation for target multi-syllable words.",
          "items": {
            "type": "object",
            "properties": {
              "core_durations": {
                "type": "object",
                "additionalProperties": {
                  "type": "number"
                },
                "description": "Duration in seconds of core vowel phonemes."
              },
              "core_phones": {
                "type": "object",
                "additionalProperties": {
                  "type": "string"
                },
                "description": "Phonetic representation of core vowel phonemes."
              },
              "correct": {
                "type": "number",
                "description": "Continuous score (0.0 to 1.0) indicating stress correctness."
              },
              "decision_confidence": {
                "type": [
                  "number",
                  "null"
                ],
                "description": "Percentage confidence of the stress decision."
              },
              "deepgram_confidence": {
                "type": "number"
              },
              "deepgram_confidence_cubed": {
                "type": "number"
              },
              "deepgram_word_index": {
                "type": "integer"
              },
              "duration_ratio": {
                "type": "number"
              },
              "duration_ratio_log": {
                "type": "number"
              },
              "expected_stress": {
                "type": "integer",
                "description": "Expected stressed syllable index (e.g. 1 for Noun, 2 for Verb)."
              },
              "feedback": {
                "type": "string"
              },
              "inferred_stress": {
                "type": [
                  "integer",
                  "null"
                ]
              },
              "token_index": {
                "type": "integer"
              },
              "word_display": {
                "type": "string"
              },
              "word_norm": {
                "type": "string"
              },
              "status": {
                "type": "string",
                "enum": [
                  "unaligned",
                  "error"
                ],
                "description": "Present only if alignment failed."
              }
            }
          }
        },
        "timezone": {
          "type": "string",
          "description": "Declared timezone."
        },
        "timing": {
          "type": "object",
          "description": "Execution durations in seconds for various pipeline stages.",
          "properties": {
            "recording_duration_sec": {
              "type": "number"
            },
            "deepgram_api_sec": {
              "type": "number"
            },
            "vocab_frequency_analysis_sec": {
              "type": "number"
            },
            "grammar_llm_sec": {
              "type": [
                "number",
                "null"
              ]
            },
            "persist_output_files_sec": {
              "type": "number"
            },
            "pos_tagging_llm_sec": {
              "type": [
                "number",
                "null"
              ]
            },
            "bucket_json_read_process_sec": {
              "type": [
                "number",
                "null"
              ]
            },
            "pocketsphinx_alignment_sec": {
              "type": [
                "number",
                "null"
              ]
            },
            "levenshtein_distance_calc_sec": {
              "type": [
                "number",
                "null"
              ]
            }
          }
        },
        "transcript": {
          "type": [
            "string",
            "null"
          ],
          "description": "The fully punctuated transcript returned by the ASR system."
        }
      }
    }
  }
}

Syllable Stress Words

Syllable stress is evaluated for the following 69 two-syllable target words: abstract, accent, address, affect, ally, circuit, combat, combine, compact, compound, concert, concrete, conduct, conflict, console, content, contest, contract, contrast, convert, costume, debut, decrease, desert, detail, discount, effect, escort, essay, export, extract, finance, impact, import, increase, insert, invite, object, perfect, permit, premiere, present, produce, progress, project, prospect, protest, rebel, recall, record, reform, refuse, reject, rescue, research, retreat, romance, segment, sentence, shanghai, subject, survey, suspect, transfer, transport, update, upgrade, upset, weekend.

How to call this agent from Strapi

The following example demonstrates how to call this agent from a Strapi back-end system using fetch().

module.exports = {
  async evaluatePronunciation(ctx) {
    // Assuming you receive the base64 audio in the request body
    const { audioBase64, expectedText } = ctx.request.body;

    // Define the JSON-RPC payload
    const payload = {
      jsonrpc: "2.0",
      id: Date.now().toString(),
      method: "pronunciation.evaluate",
      params: {
        audio_wav_base64: audioBase64,
        expected_text: expectedText
      }
    };

    try {
      // Call the agent's A2A endpoint
      // Replace with your actual deployed agent URL
      const response = await fetch('https://paa.talknicer.com/a2a', {
        method: 'POST',
        headers: {
          'Content-Type': 'application/json'
        },
        body: JSON.stringify(payload)
      });

      if (!response.ok) {
        throw new Error(`Agent error: ${response.status}`);
      }

      const data = await response.json();
      return ctx.send(data);
    } catch (err) {
      return ctx.internalServerError('Failed to evaluate speech', { error: err.message });
    }
  }
};

Bring Your Own Deepgram API Key

The shared Deepgram API key used by this demo is not guaranteed to remain available. To ensure uninterrupted access, you can provide your own key — Deepgram offers $200 in free credits with no credit card required. Sign up for Deepgram and generate an API key.

Your Deepgram API Key:

Using shared API key

Bring Your Own Gemini API Key

The shared Gemini API key used by this demo is not guaranteed to remain available. To ensure uninterrupted access, you can provide your own key — Google offers free credits. Generate a Gemini API key.

Your Gemini API Key: