Voice AI assistant using JavaScript, PHP, and ChatGPT

My disappointment with my Amazon Echo/Alexa device doubled every time I tried to use it. After some recent exploration with live video streaming, I wanted to pair my desire for a quirky voice assistant with my desire to learn more about audio stream handling in JavaScript. The result is a cheesy AI voice assistant that sometimes feeds me a good dad joke.

Click here to give it a try for yourself!

How it works

Establishing Audio Input. The script begins by creating a media stream and getting the mic input. It then listens to and analyzes the background amplitude/sound level to determine how much background noise is present, so it can distinguish between background noise and the user talking to it. This eliminates the need for a “wake word” like “Alexa” or “Hey Google”. Once the baseline amplitude has been determined, we pad it by 70% (with a minimum value of 10) to give a bit more buffer between background noise and the user speaking to the assistant. This padded baseline is shown as the “Threshold” on the UI.
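
Before that calibration loop runs, the mic stream gets wired into a Web Audio AnalyserNode. The snippet below is condensed from the full script at the end of this post; it is where the analyser, bufferLength, and dataArray used by checkAmplitude come from.

const audioContext = new AudioContext();
const analyser = audioContext.createAnalyser();
const microphone = audioContext.createMediaStreamSource(stream); //stream comes from navigator.mediaDevices.getUserMedia({ audio: true })
microphone.connect(analyser);

analyser.fftSize = 32; //a small FFT means only 16 frequency bins to average on every tick
const bufferLength = analyser.frequencyBinCount;
const dataArray = new Uint8Array(bufferLength);

let amplitudeSum = 0;
let count = 0;

With that in place, checkAmplitude samples the average amplitude every 50ms for roughly 5 seconds: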

const checkAmplitude = () => {
	whatsHappeningDiv.innerHTML = 'Calibrating . . .';
	analyser.getByteFrequencyData(dataArray);
	const average = dataArray.reduce((acc, val) => acc + val, 0) / bufferLength;
	amplitudeSum += average;
	count++;

	if (count >= 100) { // 100 * 50ms (0.05 seconds) = 5 seconds
		backgroundAmplitude = Math.max(10, 1.7 * (amplitudeSum / count)); //the average initial amplitude detected, i.e. the background noise. Add 70% for a threshold buffer, and set a minimum of 10 so high-quality mics/very quiet environments don't leave the threshold at 0.
		clearInterval(timer);
		resolve();
		whatsHappeningDiv.innerHTML = 'I\'m ready to listen.';
	}
};

Listening for Input. Next, the script simply listens to the audio stream indefinitely until the amplitude exceeds the baseline threshold defined above. The incoming amplitude is displayed to the user alongside the threshold amplitude. The user can recalibrate the baseline threshold by simply refreshing the page.

function updateAmplitude(stream) {
	const audioContext = new AudioContext();
	const analyser = audioContext.createAnalyser();
	const microphone = audioContext.createMediaStreamSource(stream);
	microphone.connect(analyser);
	analyser.fftSize = 32; //It's better to keep this low so the response generation is faster (less to analyze and average)
	const bufferLength = analyser.frequencyBinCount;
	const dataArray = new Uint8Array(bufferLength);

	const checkAmplitude = () => {
		analyser.getByteFrequencyData(dataArray);
		//console.log('Frequency Data Array:', dataArray); // Log the array data
		const average = dataArray.reduce((acc, val) => acc + val, 0) / bufferLength;
		const amplitudeDisplay = document.getElementById('amplitudeDisplay');
		amplitudeDisplay.textContent = 'Amplitude: ' + average.toFixed(0) + '. Threshold: ' + Math.round(backgroundAmplitude) + '.';

			if (average > backgroundAmplitude && !isRecording && !isPaused && !isWaitingForResponse && !isAudioPlaying) {
				startRecording();
				console.log('Recording STARTED due to high amplitude.' + 'recording: ' + isRecording + 'pause:' + isPaused + 'waiting: ' + isWaitingForResponse + 'playing: ' + isAudioPlaying);
			} else if (average < backgroundAmplitude && isRecording && !isPaused) {
				if (!lowAmplitudeStartTime) {
					lowAmplitudeStartTime = Date.now();
				} else if (Date.now() - lowAmplitudeStartTime >= 3000) {
					stopRecording(); //If there's more than 3 seconds of quiet, stop recording.
					console.log('Recording STOPPED due to low amplitude.');
				}
			} else {
				lowAmplitudeStartTime = null;
			}
	};

	timer = setInterval(checkAmplitude, 50);
}
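
The gating flags referenced in that condition (isRecording, isPaused, isWaitingForResponse, isAudioPlaying) are plain booleans declared at the top of the script; they're pulled here from the full listing so the checks above read on their own:

let mediaRecorder;
let audioChunks = [];
let isRecording = false;
let isPaused = false;
let isWaitingForResponse = false;
let isAudioPlaying = false;
let lowAmplitudeStartTime;
let backgroundAmplitude = 0;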

Recording Audio. Once the baseline amplitude threshold has been exceeded, the script begins recording audio. When it then detects audio below the threshold for more than 3 seconds *or* the recording time exceeds 10 seconds, it stops recording and bundles the captured audio into an MP3 blob for the server.

function startRecording() {
	if (!isRecording && !isWaitingForResponse && !isAudioPlaying) {
		mediaRecorder.start();
		isRecording = true;
		whatsHappeningDiv.innerHTML = 'Listening . . .';
		humanTextRequestDiv.innerHTML = '';
		assistantResponseTextDiv.innerHTML = '';
		recordingTimeout = setTimeout(stopRecording, 10000); //If listening for more than 10 seconds, stop.
		lowAmplitudeStartTime = null;
	}
}
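
Stopping is the mirror image; these pieces are taken from the full script further down. stopRecording clears the 10-second timeout and stops the MediaRecorder, and the recorder's onstop handler bundles the buffered chunks into a blob and hands it to saveRecording.

function stopRecording() {
	if (isRecording) {
		clearTimeout(recordingTimeout);
		whatsHappeningDiv.innerHTML = 'No longer listening . . .';
		mediaRecorder.stop();
		isRecording = false;
	}
}

mediaRecorder.onstop = () => {
	const audioBlob = new Blob(audioChunks, { 'type': 'audio/mp3' });
	saveRecording(audioBlob);
	audioChunks = [];
};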
...
function saveRecording(blob) {
	isWaitingForResponse = true;
	const xhr = new XMLHttpRequest();
	xhr.onload = function () {
		isWaitingForResponse = false;
		if (xhr.status === 200) {
			const responseJson = JSON.parse(xhr.responseText);
			humanTextRequestDiv.innerHTML = '<strong>What I heard:</strong> ' + responseJson.human_text_request;
			assistantResponseTextDiv.innerHTML = '<strong>My response:</strong> ' + responseJson.assistant_response_text;
			console.log('Recording saved successfully.');
			audioPlayer.src = responseJson.audio_src;
			audioPlayer.load();
			audioPlayer.play();
			console.log(responseJson);
			console.log('Audio src updated and reloaded: ' + responseJson.audio_src);
		} else {
			whatsHappeningDiv.innerHTML = 'Failed to save recording: ' + xhr.statusText;
			console.error('Failed to save recording:', xhr.statusText);
		}
	};
	xhr.open('POST', 'write_file.php');
	console.log('File sent to POST handler.');
	whatsHappeningDiv.innerHTML = 'Thinking about what you said . . .';
	xhr.send(blob);
}

Transcribing Audio to Text. From here, we shift from JavaScript to the PHP handler. The handler leverages OpenAI’s Whisper model via the audio transcriptions (speech-to-text) endpoint to turn the recording into text.
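
Before any API calls, write_file.php (the endpoint the XHR posts to) writes the raw request body to disk and sets up the Authorization header. These lines are taken from the full script below; the only thing you need to supply is your own OpenAI API key.

//Handling the audio input
$audio_data = file_get_contents('php://input');
$file_id = uniqid();
$filename = "recordings/$file_id.mp3";
file_put_contents($filename, $audio_data);

//Setting the standard header and prepping the response
$json_summary = new stdClass();
$openAIToken = ""; //your OpenAI API key goes here
$headers = [
    "Authorization: Bearer $openAIToken",
];

A small helper wraps the repeated cURL calls, and the transcription step then posts the saved file: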

//A helper to handle the repeated cURL calls
function callOpenAPI($url, $postData, $headers, $decodeJson = true) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $postData);
    $response = curl_exec($ch);

    if (curl_errno($ch)) {
        $error = curl_error($ch);
        curl_close($ch);
        return (object)['errors' => $error]; //return the error as an object instead of trying to json_decode an array
    }

    curl_close($ch);
    //The speech endpoint returns raw audio bytes, so callers can skip json_decode when needed.
    return $decodeJson ? json_decode($response) : $response;
}

// Step 1: Transcribe the input
$postTranscriptionData = [
    'model' => 'whisper-1',
    'file' => curl_file_create($filename),
    'response_format' => 'verbose_json',
    'language' => 'en',
];
$response = callOpenAPI('https://api.openai.com/v1/audio/transcriptions', $postTranscriptionData, $headers);
$text = isset($response->text) ? $response->text : '';
$json_summary->human_text_request = $text;

Prompting ChatGPT for a Response. I wanted some quirky feedback, so I tweaked my prompt to return some dad jokes, but otherwise it’s pretty straightforward using the OpenAI Chat Completions endpoint.

// Step 2: Generate response from ChatGPT
$headers = [
    "Content-Type: application/json",
    "Authorization: Bearer $openAIToken",
];
$postGPTData = [
    'model' => 'gpt-4-turbo-preview',
    'messages' => [
        ['role' => 'system', 'content' => "You're a virtual assistant who is interpreting voice requests from users. They like to joke and enjoy sarcasm but appreciate factual and succinct responses. Keep your responses to their requests and questions to less than 100 words unless they ask for something longer. They love a good edgy or sarcastic dad joke if you can incorporate one as part of your response -- but don't make it too corny."],
        ['role' => 'user', 'content' => $text],
    ],
];
$response = callOpenAPI('https://api.openai.com/v1/chat/completions', json_encode($postGPTData), $headers);
$assistant_response = isset($response->choices[0]->message->content) ? $response->choices[0]->message->content : '';
$json_summary->assistant_response_text = $assistant_response;

Generating Text to Speech. Lastly, we convert the ChatGPT response to speech and save it as an MP3 file as well. The transcription, the text response, and the audio file path are then bundled into JSON, passed back to the JavaScript, and displayed and played for the user.

// Step 3: Generate speech response
$postTTSData = [
    'model' => 'tts-1',
    'input' => $assistant_response,
    'voice' => 'onyx',
];
$response = callOpenAPI('https://api.openai.com/v1/audio/speech', json_encode($postTTSData), $headers, false); //false: keep the raw MP3 bytes rather than json_decoding them
file_put_contents("recordings/{$file_id}_response.mp3", $response);
//$json_summary->test = $response;
$json_summary->audio_src = "recordings/{$file_id}_response.mp3";
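
Finally, the handler returns everything the JavaScript is waiting on as a single JSON object (this is the last line of the full script below):

// Step 4: Return json
echo json_encode($json_summary); //e.g. {"human_text_request":"...","assistant_response_text":"...","audio_src":"recordings/..._response.mp3"}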

The full JavaScript:

document.addEventListener('DOMContentLoaded', () => {
	const audioPlayer = document.getElementById('responseAudio');
	const whatsHappeningDiv = document.getElementById('whats_happening');
	const humanTextRequestDiv = document.getElementById('human_text_requestDiv');
	const assistantResponseTextDiv = document.getElementById('assistant_response_textDiv');
	const toggleMute = document.getElementById('toggle_mute');

	let mediaRecorder;
	let audioChunks = [];
	let isRecording = false;
	let isPaused = false;
	let isWaitingForResponse = false;
	let isAudioPlaying = false;
	let timer;
	let recordingTimeout;
	let lowAmplitudeStartTime;
	let backgroundAmplitude = 0;

	function calculateBackgroundAmplitude(stream) {
		return new Promise((resolve, reject) => {
			const audioContext = new AudioContext();
			const analyser = audioContext.createAnalyser();
			const microphone = audioContext.createMediaStreamSource(stream);
			microphone.connect(analyser);

			analyser.fftSize = 32; //It's better to keep this low so the response generation is faster (less to analyze and average)

			const bufferLength = analyser.frequencyBinCount;
			const dataArray = new Uint8Array(bufferLength);

			let amplitudeSum = 0;
			let count = 0;

			const checkAmplitude = () => {
				whatsHappeningDiv.innerHTML = 'Calibrating . . .';
				analyser.getByteFrequencyData(dataArray);
				const average = dataArray.reduce((acc, val) => acc + val, 0) / bufferLength;
				amplitudeSum += average;
				count++;

				if (count >= 100) { // 100 * 50ms (0.05 seconds) = 5 seconds
					backgroundAmplitude = Math.max(10, 1.7 * (amplitudeSum / count)); //the average initial amplitude detected, i.e. the background noise. Add 70% for a threshold buffer, and set a minimum of 10 so high-quality mics/very quiet environments don't leave the threshold at 0.
					clearInterval(timer);
					resolve();
					whatsHappeningDiv.innerHTML = 'I\'m ready to listen.';
				}
			};

			timer = setInterval(checkAmplitude, 50);
		});
	}

	if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
		navigator.mediaDevices.getUserMedia({ audio: true })
			.then(async stream => {
				await calculateBackgroundAmplitude(stream); // Call function to calculate background amplitude
				updateAmplitude(stream);
				mediaRecorder = new MediaRecorder(stream);
				mediaRecorder.ondataavailable = event => {
					audioChunks.push(event.data);
				};
				mediaRecorder.onstop = () => {
					const audioBlob = new Blob(audioChunks, { 'type': 'audio/mp3' });
					saveRecording(audioBlob);
					audioChunks = [];
				};
			})
			.catch(error => {
				console.error('Error accessing microphone:', error);
				whatsHappeningDiv.innerHTML = 'Please allow microphone access and then refresh the page if needed.';
			});
	} else {
		console.error('getUserMedia not supported in this browser.');
		whatsHappeningDiv.innerHTML = 'Your browser isn\'t supported.';
	}

	function startRecording() {
		if (!isRecording && !isWaitingForResponse && !isAudioPlaying) {
			mediaRecorder.start();
			isRecording = true;
			whatsHappeningDiv.innerHTML = 'Listening . . .';
			humanTextRequestDiv.innerHTML = '';
			assistantResponseTextDiv.innerHTML = '';
			recordingTimeout = setTimeout(stopRecording, 10000); //If listening for more than 10 seconds, stop.
			lowAmplitudeStartTime = null;
		}
	}

	function stopRecording() {
		if (isRecording) {
			clearTimeout(recordingTimeout);
			whatsHappeningDiv.innerHTML = 'No longer listening . . .';
			mediaRecorder.stop();
			isRecording = false;
		}
	}

	/*
	I intend to add ability to mute TTS audio playback at some point.
	function muteAudio() {
		audioPlayer.muted = !audioPlayer.muted;
		toggleMute.innerHTML = audioPlayer.muted ? '<a onclick="muteAudio()">Toggle Mute Version 1</a>' : '<a onclick="muteAudio()">Toggle Mute Version 2</a>';
	}
	*/
	
	function handleAudioEvent(event) {
		if (event.type === 'play') {
			isAudioPlaying = true;
			console.log('Audio is playing.');
		} else if (event.type === 'ended') {
			isAudioPlaying = false;
			console.log('Audio has stopped playing.');
			whatsHappeningDiv.innerHTML = 'I\'m ready to listen again.';
		}
	}

	audioPlayer.addEventListener('play', handleAudioEvent);
	audioPlayer.addEventListener('ended', handleAudioEvent);

	function saveRecording(blob) {
		isWaitingForResponse = true;
		const xhr = new XMLHttpRequest();
		xhr.onload = function () {
			isWaitingForResponse = false;
			if (xhr.status === 200) {
				const responseJson = JSON.parse(xhr.responseText);
				humanTextRequestDiv.innerHTML = '<strong>What I heard:</strong> ' + responseJson.human_text_request;
				assistantResponseTextDiv.innerHTML = '<strong>My response:</strong> ' + responseJson.assistant_response_text;
				console.log('Recording saved successfully.');
				audioPlayer.src = responseJson.audio_src;
				audioPlayer.load();
				audioPlayer.play();
				console.log(responseJson);
				console.log('Audio src updated and reloaded: ' + responseJson.audio_src);
			} else {
				whatsHappeningDiv.innerHTML = 'Failed to save recording: ' + xhr.statusText;
				console.error('Failed to save recording:', xhr.statusText);
			}
		};
		xhr.open('POST', 'write_file.php');
		console.log('File sent to POST handler.');
		whatsHappeningDiv.innerHTML = 'Thinking about what you said . . .';
		xhr.send(blob);
	}

	function updateAmplitude(stream) {
		const audioContext = new AudioContext();
		const analyser = audioContext.createAnalyser();
		const microphone = audioContext.createMediaStreamSource(stream);
		microphone.connect(analyser);
		analyser.fftSize = 32; //It's better to keep this low so the response generation is faster (less to analyze and average)
		const bufferLength = analyser.frequencyBinCount;
		const dataArray = new Uint8Array(bufferLength);

		const checkAmplitude = () => {
			analyser.getByteFrequencyData(dataArray);
			//console.log('Frequency Data Array:', dataArray); // Log the array data
			const average = dataArray.reduce((acc, val) => acc + val, 0) / bufferLength;
			const amplitudeDisplay = document.getElementById('amplitudeDisplay');
			amplitudeDisplay.textContent = 'Amplitude: ' + average.toFixed(0) + '. Threshold: ' + Math.round(backgroundAmplitude) + '.';

				if (average > backgroundAmplitude && !isRecording && !isPaused && !isWaitingForResponse && !isAudioPlaying) {
					startRecording();
					console.log('Recording STARTED due to high amplitude.' + 'recording: ' + isRecording + 'pause:' + isPaused + 'waiting: ' + isWaitingForResponse + 'playing: ' + isAudioPlaying);
				} else if (average < backgroundAmplitude && isRecording && !isPaused) {
					if (!lowAmplitudeStartTime) {
						lowAmplitudeStartTime = Date.now();
					} else if (Date.now() - lowAmplitudeStartTime >= 3000) {
						stopRecording(); //If there's more than 3 seconds of quiet, stop recording.
						console.log('Recording STOPPED due to low amplitude.');
					}
				} else {
					lowAmplitudeStartTime = null;
				}
		};

		timer = setInterval(checkAmplitude, 50);
	}

});

The full PHP script:

<?php
//A helper to handle the repeated cURL calls
function callOpenAPI($url, $postData, $headers, $decodeJson = true) {
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_POST, true);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $headers);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $postData);
    $response = curl_exec($ch);

    if (curl_errno($ch)) {
        $error = curl_error($ch);
        curl_close($ch);
        return (object)['errors' => $error]; //return the error as an object instead of trying to json_decode an array
    }

    curl_close($ch);
    //The speech endpoint returns raw audio bytes, so callers can skip json_decode when needed.
    return $decodeJson ? json_decode($response) : $response;
}

//Handing the audio input
$audio_data = file_get_contents('php://input');
$file_id = uniqid();
$filename = "recordings/$file_id.mp3";
file_put_contents($filename, $audio_data);

//Setting the standard header and prepping the response
$json_summary = new stdClass();
$openAIToken="";
$headers = [
    "Authorization: Bearer $openAIToken",
];

////////////////////////////////////////////////////////////
// Step 1: Transcribe the input
$postTranscriptionData = [
    'model' => 'whisper-1',
    'file' => curl_file_create($filename),
    'response_format' => 'verbose_json',
    'language' => 'en',
];
$response = callOpenAPI('https://api.openai.com/v1/audio/transcriptions', $postTranscriptionData, $headers);
$text = isset($response->text) ? $response->text : '';
$json_summary->human_text_request = $text;

////////////////////////////////////////////////////////////
// Step 2: Generate response from ChatGPT
$headers = [
    "Content-Type: application/json",
    "Authorization: Bearer $openAIToken",
];
$postGPTData = [
    'model' => 'gpt-4-turbo-preview',
    'messages' => [
        ['role' => 'system', 'content' => "You're a virtual assistant who is interpreting voice requests from users. They like to joke and enjoy sarcasm but appreciate factual and succinct responses. Keep your responses to their requests and questions to less than 100 words unless they ask for something longer. They love a good edgy or sarcastic dad joke if you can incorporate one as part of your response -- but don't make it too corny."],
        ['role' => 'user', 'content' => $text],
    ],
];
$response = callOpenAPI('https://api.openai.com/v1/chat/completions', json_encode($postGPTData), $headers);
$assistant_response = isset($response->choices[0]->message->content) ? $response->choices[0]->message->content : '';
$json_summary->assistant_response_text = $assistant_response;

////////////////////////////////////////////////////////////
// Step 3: Generate speech response
$postTTSData = [
    'model' => 'tts-1',
    'input' => $assistant_response,
    'voice' => 'onyx',
];
$response = callOpenAPI('https://api.openai.com/v1/audio/speech', json_encode($postTTSData), $headers, false); //false: keep the raw MP3 bytes rather than json_decoding them
file_put_contents("recordings/{$file_id}_response.mp3", $response);
//$json_summary->test = $response;
$json_summary->audio_src = "recordings/{$file_id}_response.mp3";

////////////////////////////////////////////////////////////
// Step 4: Return json
echo json_encode($json_summary);
?>

If you want to use the same CSS styling I have for my demo, here’s the full page:

<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>The Better Alexa</title>
    <style>
        body {
            display: flex;
            justify-content: center;
            align-items: center;
            height: 100vh;
            margin: 0;
            background: linear-gradient(to bottom, #1f0036, #000000);
        }

        .centered-content {
            text-align: center;
            color: white;
            font-family: Helvetica, sans-serif;
        }

        .whats_happening {
            padding-bottom: 50px;
            font-size: 30px;
            font-weight: bold;

            --background: linear-gradient(to right, #553c9a 20%, #FFCC00 40%, #ee4b2b 60%, #ee4b2b 80%);
            background: linear-gradient(to right, #FEAC5E 20%, #C779D0 40%, #4BC0C8 60%, #FEAC5E 80%);
            background-size: 200% auto;

            color: #000;
            background-clip: text;
            -webkit-background-clip: text;
            text-fill-color: transparent;
            -webkit-text-fill-color: transparent;

            animation: shine 20s linear infinite;
        }

        @keyframes shine {
            to {
                background-position: 200% center;
            }
        }

        .toggle_mute {
            color: #ffffff;
            --font-size:80px;
            padding-bottom: 20px;
        }
        .human_text_requestDiv {
            color: #777;
            padding-bottom: 20px;
        }
        .assistant_response_textDiv {
            color: #ffffff;
        }
        .amplitudeDisplay {
            color: #555;
            font-size: 10px;
            padding-top: 50px;
        }

    </style>
</head>
<body>
    <div class="centered-content">
        <audio src="#" controls id="responseAudio" name="responseAudio" style="display: none;"></audio>
        <div id="whats_happening" class="whats_happening">Checking mic . . .</div>
        <div id="toggle_mute" class="toggle_mute"></div>
        <div id="human_text_requestDiv" class="human_text_requestDiv"></div>
        <div id="assistant_response_textDiv" class="assistant_response_textDiv"></div>
        <div id="amplitudeDisplay" class="amplitudeDisplay"></div>
    </div>
</body>
<script>
    document.addEventListener('DOMContentLoaded', () => {
        const audioPlayer = document.getElementById('responseAudio');
        const whatsHappeningDiv = document.getElementById('whats_happening');
        const humanTextRequestDiv = document.getElementById('human_text_requestDiv');
        const assistantResponseTextDiv = document.getElementById('assistant_response_textDiv');
        const toggleMute = document.getElementById('toggle_mute');

        let mediaRecorder;
        let audioChunks = [];
        let isRecording = false;
        let isPaused = false;
        let isWaitingForResponse = false;
        let isAudioPlaying = false;
        let timer;
        let recordingTimeout;
        let lowAmplitudeStartTime;
        let backgroundAmplitude = 0;

        function calculateBackgroundAmplitude(stream) {
            return new Promise((resolve, reject) => {
                const audioContext = new AudioContext();
                const analyser = audioContext.createAnalyser();
                const microphone = audioContext.createMediaStreamSource(stream);
                microphone.connect(analyser);

                analyser.fftSize = 32; //It's better to keep this low so the response generation is faster (less to analyze and average)

                const bufferLength = analyser.frequencyBinCount;
                const dataArray = new Uint8Array(bufferLength);

                let amplitudeSum = 0;
                let count = 0;

                const checkAmplitude = () => {
					whatsHappeningDiv.innerHTML = 'Calibrating . . .';
                    analyser.getByteFrequencyData(dataArray);
                    const average = dataArray.reduce((acc, val) => acc + val, 0) / bufferLength;
                    amplitudeSum += average;
                    count++;

                    if (count >= 100) { // 100 * 50ms (0.05 seconds) = 5 seconds
                        backgroundAmplitude = Math.max(10, 1.7 * (amplitudeSum / count)); //the average initial amplitude detected, i.e. the background noise. Add 70% for a threshold buffer, and set a minimum of 10 so high-quality mics/very quiet environments don't leave the threshold at 0.
                        clearInterval(timer);
                        resolve();
						whatsHappeningDiv.innerHTML = 'I\'m ready to listen.';
                    }
                };

                timer = setInterval(checkAmplitude, 50);
            });
        }

        if (navigator.mediaDevices && navigator.mediaDevices.getUserMedia) {
            navigator.mediaDevices.getUserMedia({ audio: true })
                .then(async stream => {
                    await calculateBackgroundAmplitude(stream); // Call function to calculate background amplitude
                    updateAmplitude(stream);
                    mediaRecorder = new MediaRecorder(stream);
                    mediaRecorder.ondataavailable = event => {
                        audioChunks.push(event.data);
                    };
                    mediaRecorder.onstop = () => {
                        const audioBlob = new Blob(audioChunks, { 'type': 'audio/mp3' });
                        saveRecording(audioBlob);
                        audioChunks = [];
                    };
                })
                .catch(error => {
                    console.error('Error accessing microphone:', error);
                    whatsHappeningDiv.innerHTML = 'Please allow microphone access and then refresh the page if needed.';
                });
        } else {
            console.error('getUserMedia not supported in this browser.');
            whatsHappeningDiv.innerHTML = 'Your browser isn\'t supported.';
        }

        function startRecording() {
            if (!isRecording && !isWaitingForResponse && !isAudioPlaying) {
                mediaRecorder.start();
                isRecording = true;
                whatsHappeningDiv.innerHTML = 'Listening . . .';
                humanTextRequestDiv.innerHTML = '';
                assistantResponseTextDiv.innerHTML = '';
                recordingTimeout = setTimeout(stopRecording, 10000); //If listening for more than 10 seconds, stop.
                lowAmplitudeStartTime = null;
            }
        }

        function stopRecording() {
            if (isRecording) {
                clearTimeout(recordingTimeout);
                whatsHappeningDiv.innerHTML = 'No longer listening . . .';
                mediaRecorder.stop();
                isRecording = false;
            }
        }

        /*
		I intend to add ability to mute TTS audio playback at some point.
		function muteAudio() {
            audioPlayer.muted = !audioPlayer.muted;
            toggleMute.innerHTML = audioPlayer.muted ? '<a onclick="muteAudio()">Toggle Mute Version 1</a>' : '<a onclick="muteAudio()">Toggle Mute Version 2</a>';
        }
		*/
		
        function handleAudioEvent(event) {
            if (event.type === 'play') {
                isAudioPlaying = true;
                console.log('Audio is playing.');
            } else if (event.type === 'ended') {
                isAudioPlaying = false;
                console.log('Audio has stopped playing.');
                whatsHappeningDiv.innerHTML = 'I\'m ready to listen again.';
            }
        }

        audioPlayer.addEventListener('play', handleAudioEvent);
        audioPlayer.addEventListener('ended', handleAudioEvent);

        function saveRecording(blob) {
            isWaitingForResponse = true;
            const xhr = new XMLHttpRequest();
            xhr.onload = function () {
                isWaitingForResponse = false;
                if (xhr.status === 200) {
                    const responseJson = JSON.parse(xhr.responseText);
                    humanTextRequestDiv.innerHTML = '<strong>What I heard:</strong> ' + responseJson.human_text_request;
                    assistantResponseTextDiv.innerHTML = '<strong>My response:</strong> ' + responseJson.assistant_response_text;
                    console.log('Recording saved successfully.');
                    audioPlayer.src = responseJson.audio_src;
                    audioPlayer.load();
                    audioPlayer.play();
                    console.log(responseJson);
                    console.log('Audio src updated and reloaded: ' + responseJson.audio_src);
                } else {
                    whatsHappeningDiv.innerHTML = 'Failed to save recording: ' + xhr.statusText;
                    console.error('Failed to save recording:', xhr.statusText);
                }
            };
            xhr.open('POST', 'write_file.php');
            console.log('File sent to POST handler.');
            whatsHappeningDiv.innerHTML = 'Thinking about what you said . . .';
            xhr.send(blob);
        }

        function updateAmplitude(stream) {
            const audioContext = new AudioContext();
            const analyser = audioContext.createAnalyser();
            const microphone = audioContext.createMediaStreamSource(stream);
            microphone.connect(analyser);
            analyser.fftSize = 32; //It's better to keep this low so the response generation is faster (less to analyze and average)
            const bufferLength = analyser.frequencyBinCount;
            const dataArray = new Uint8Array(bufferLength);

            const checkAmplitude = () => {
                analyser.getByteFrequencyData(dataArray);
                //console.log('Frequency Data Array:', dataArray); // Log the array data
                const average = dataArray.reduce((acc, val) => acc + val, 0) / bufferLength;
                const amplitudeDisplay = document.getElementById('amplitudeDisplay');
                amplitudeDisplay.textContent = 'Amplitude: ' + average.toFixed(0) + '. Threshold: ' + Math.round(backgroundAmplitude) + '.';

                    if (average > backgroundAmplitude && !isRecording && !isPaused && !isWaitingForResponse && !isAudioPlaying) {
                        startRecording();
                        console.log('Recording STARTED due to high amplitude.' + 'recording: ' + isRecording + 'pause:' + isPaused + 'waiting: ' + isWaitingForResponse + 'playing: ' + isAudioPlaying);
                    } else if (average < backgroundAmplitude && isRecording && !isPaused) {
                        if (!lowAmplitudeStartTime) {
                            lowAmplitudeStartTime = Date.now();
                        } else if (Date.now() - lowAmplitudeStartTime >= 3000) {
                            stopRecording(); //If there's more than 3 seconds of quiet, stop recording.
                            console.log('Recording STOPPED due to low amplitude.');
                        }
                    } else {
                        lowAmplitudeStartTime = null;
                    }
            };

            timer = setInterval(checkAmplitude, 50);
        }

    });
</script>
</html>