// File listing metadata: 243 lines, 7.4 KiB, JavaScript.
/**
 * OpenAI model name used for each speech-to-text quality preset.
 * Frozen because it is a shared module-level lookup table.
 */
const OPENAI_MODELS_BY_QUALITY = Object.freeze({
  easy: 'whisper-1',
  medium: 'gpt-4o-mini-transcribe',
  hard: 'gpt-4o-transcribe',
});
|
||
|
||
/**
 * Value sent as `length_scale` to the Piper HTTP endpoint for each quality
 * preset. Values are kept as strings, exactly as they are sent in the request.
 * Frozen because it is a shared module-level lookup table.
 */
const PIPER_LENGTH_SCALE_BY_QUALITY = Object.freeze({
  easy: '1.15',
  medium: '1.0',
  hard: '0.9',
});
|
||
|
||
/**
 * Normalizes a configured OpenAI-compatible API base URL.
 *
 * @param {unknown} url - Configured base URL; may be empty, null or undefined.
 * @returns {string} The trimmed URL with trailing slashes removed, or
 *   `https://api.openai.com/v1` when nothing is configured.
 */
function normalizeOpenAiBaseUrl(url) {
  const value = String(url || '').trim();
  if (!value) return 'https://api.openai.com/v1';
  // Strip trailing slashes so callers can append `/path` without doubling them.
  return value.replace(/\/+$/, '');
}
|
||
|
||
/**
 * Chooses the speech-to-text model for a configuration object.
 *
 * An explicit, non-blank `config.model` always wins. Otherwise the model is
 * looked up by `config.quality` (case-insensitive), falling back to the
 * `medium` preset for missing or unknown qualities.
 *
 * @param {{ model?: unknown, quality?: unknown } | null | undefined} config
 * @returns {string} Model name to send to the transcription API.
 */
function resolveSttModel(config) {
  const customModel = String(config?.model || '').trim();
  if (customModel) return customModel;

  const quality = String(config?.quality || 'medium').toLowerCase();
  // Own-property check: a quality such as "constructor" or "toString" must not
  // resolve to an inherited Object.prototype member.
  const isKnownQuality = Object.prototype.hasOwnProperty.call(OPENAI_MODELS_BY_QUALITY, quality);
  return (isKnownQuality && OPENAI_MODELS_BY_QUALITY[quality]) || OPENAI_MODELS_BY_QUALITY.medium;
}
|
||
|
||
/**
 * Reports whether speech-to-text can be used with the given settings.
 *
 * Only the `openai` provider (the default when none is set) is accepted, and
 * it requires a non-blank `apiKey`.
 *
 * @param {object | null | undefined} entrySettings - Settings object; the
 *   configuration is read from `entrySettings.tools.speechToText`.
 * @returns {boolean} `true` when the provider is `openai` and an API key is set.
 */
export function isSpeechToTextConfigured(entrySettings) {
  const cfg = entrySettings?.tools?.speechToText || {};
  if (String(cfg.provider || 'openai') !== 'openai') return false;
  return String(cfg.apiKey || '').trim() !== '';
}
|
||
|
||
/**
 * Reports whether text-to-speech can be used with the given settings.
 *
 * - `browser` (the default provider) needs no configuration.
 * - `piper-http` requires a non-blank `piperBaseUrl`.
 * - `openai` requires a non-blank `apiKey`.
 * - Any other provider is treated as not configured.
 *
 * @param {object | null | undefined} entrySettings - Settings object; the
 *   configuration is read from `entrySettings.tools.textToSpeech`.
 * @returns {boolean} Whether the selected provider has what it needs.
 */
export function isTextToSpeechConfigured(entrySettings) {
  const cfg = entrySettings?.tools?.textToSpeech || {};
  const provider = String(cfg.provider || 'browser');

  switch (provider) {
    case 'browser':
      return true;
    case 'piper-http':
      return String(cfg.piperBaseUrl || '').trim() !== '';
    case 'openai':
      return String(cfg.apiKey || '').trim() !== '';
    default:
      return false;
  }
}
|
||
|
||
/**
 * Picks the upload file name for a recording so that its extension matches
 * the blob's MIME type. Falls back to `.webm` for unknown or missing types.
 *
 * @param {{ type?: string } | null | undefined} blob
 * @returns {string} File name to attach to the multipart upload.
 */
function sttUploadFileName(blob) {
  const type = String(blob?.type || '').toLowerCase();
  if (type.includes('webm')) return 'voice-input.webm';
  if (type.includes('m4a')) return 'voice-input.m4a';
  if (type.includes('mp4')) return 'voice-input.mp4';
  if (type.includes('ogg')) return 'voice-input.ogg';
  if (type.includes('wav')) return 'voice-input.wav';
  if (type.includes('mpeg') || type.includes('mp3')) return 'voice-input.mp3';
  return 'voice-input.webm';
}

/**
 * Sends a recorded audio blob to the OpenAI-compatible transcription endpoint
 * and returns the recognized text.
 *
 * The request is a multipart POST to `<baseUrl>/audio/transcriptions` with the
 * language fixed to `ru` and a JSON response format.
 *
 * @param {Blob} audioBlob - Recorded audio to transcribe.
 * @param {object | null | undefined} entrySettings - Settings object; the
 *   configuration is read from `entrySettings.tools.speechToText`
 *   (`provider`, `apiKey`, `model`, `quality`, `baseUrl`).
 * @returns {Promise<string>} Trimmed transcription text.
 * @throws {Error} When the provider is not `openai`, the API key is missing,
 *   the recording is missing or empty, the API answers with a non-2xx status,
 *   or the response contains no text.
 */
export async function transcribeAudioBySettings(audioBlob, entrySettings) {
  const cfg = entrySettings?.tools?.speechToText || {};
  const provider = String(cfg.provider || 'openai');
  if (provider !== 'openai') {
    throw new Error('Поддерживается только провайдер OpenAI для распознавания.');
  }

  const apiKey = String(cfg.apiKey || '').trim();
  if (!apiKey) throw new Error('Не заполнен OpenAI API key.');

  // Fail early with a clear message instead of letting FormData or the API
  // reject a missing or empty recording.
  if (!audioBlob || !(audioBlob.size > 0)) {
    throw new Error('Пустая аудиозапись.');
  }

  const model = resolveSttModel(cfg);
  const baseUrl = normalizeOpenAiBaseUrl(cfg.baseUrl);

  const form = new FormData();
  form.append('model', model);
  form.append('language', 'ru');
  form.append('response_format', 'json');
  form.append('file', audioBlob, sttUploadFileName(audioBlob));

  // No explicit Content-Type: fetch sets the multipart boundary for FormData.
  const response = await fetch(`${baseUrl}/audio/transcriptions`, {
    method: 'POST',
    headers: {
      Authorization: `Bearer ${apiKey}`,
    },
    body: form,
  });

  if (!response.ok) {
    const body = await response.text().catch(() => '');
    throw new Error(`Ошибка STT API (${response.status}): ${body || 'unknown error'}`);
  }

  // A non-JSON success body is reported as an empty recognition result
  // rather than leaking a raw SyntaxError to the caller.
  const payload = await response.json().catch(() => null);
  const text = String(payload?.text || '').trim();
  if (!text) throw new Error('Пустой ответ распознавания.');
  return text;
}
|
||
|
||
/**
 * Creates a microphone recorder built on `getUserMedia` + `MediaRecorder`,
 * with an optional input-level meter driven by an `AudioContext` analyser.
 *
 * Returned API:
 * - `start(onTick?)` — asks for the microphone and starts recording. While
 *   recording, `onTick({ elapsedMs, level })` is called about every 120 ms;
 *   `level` is the mean analyser byte value scaled to 0..1 (stays 0 when no
 *   `AudioContext` implementation is available).
 * - `stop()` — stops recording and resolves with the recorded `Blob`, or with
 *   `null` when nothing is being recorded.
 * - `cancel()` — discards the recording and releases every resource.
 *
 * Every exit path (stop, cancel, failed start, restarted start) releases the
 * microphone tracks, timers and the audio context.
 *
 * @returns {{ start: (onTick?: Function) => Promise<void>, stop: () => Promise<Blob|null>, cancel: () => void }}
 */
export function createMicrophoneRecorder() {
  const Ctx = window.AudioContext || window.webkitAudioContext;
  const PREFERRED_MIME_TYPE = 'audio/webm';

  let stream = null;
  let recorder = null;
  let audioCtx = null;
  let source = null;
  let analyser = null;
  let blobType = PREFERRED_MIME_TYPE;
  let startedAtMs = 0;
  let chunks = [];
  let timerId = 0;
  let rafId = 0;
  let level = 0;
  // Bumped by every cleanup so a start() still waiting for the permission
  // prompt can detect that it was cancelled or superseded.
  let session = 0;

  /** Stops every track of a media stream, ignoring per-track failures. */
  function stopTracks(mediaStream) {
    mediaStream.getTracks().forEach((track) => {
      try {
        track.stop();
      } catch {
        // Best-effort: one failing track must not keep the others alive.
      }
    });
  }

  /** Releases timers, the analyser graph, the audio context and the microphone. */
  function cleanup() {
    session += 1;
    if (timerId) window.clearInterval(timerId);
    if (rafId) window.cancelAnimationFrame(rafId);
    timerId = 0;
    rafId = 0;
    level = 0;
    analyser = null;

    if (source) {
      try {
        source.disconnect();
      } catch {
        // Best-effort teardown.
      }
    }
    source = null;

    if (audioCtx) {
      // Browsers cap the number of live AudioContexts, so each one must be closed.
      try {
        const closing = audioCtx.close();
        if (closing && typeof closing.catch === 'function') closing.catch(() => {});
      } catch {
        // Context was already closed.
      }
    }
    audioCtx = null;

    if (stream) stopTracks(stream);
    stream = null;
    recorder = null;
  }

  /** Starts the requestAnimationFrame loop that keeps `level` up to date. */
  function startLevelMeter() {
    if (!Ctx) return;
    audioCtx = new Ctx();
    source = audioCtx.createMediaStreamSource(stream);
    const node = audioCtx.createAnalyser();
    node.fftSize = 256;
    source.connect(node);
    analyser = node;

    const data = new Uint8Array(node.frequencyBinCount);
    const read = () => {
      // Stop the loop once this analyser has been torn down or replaced.
      if (analyser !== node) return;
      node.getByteFrequencyData(data);
      let sum = 0;
      for (let i = 0; i < data.length; i += 1) sum += data[i];
      level = data.length > 0 ? Math.max(0, Math.min(1, (sum / data.length) / 255)) : 0;
      rafId = window.requestAnimationFrame(read);
    };
    read();
  }

  async function start(onTick) {
    // Release anything left from a previous start() that was never stopped.
    cleanup();
    const mySession = session;

    const acquired = await navigator.mediaDevices.getUserMedia({ audio: true, video: false });
    if (mySession !== session) {
      // cancel() or another start() ran while the permission prompt was open.
      stopTracks(acquired);
      return;
    }
    stream = acquired;

    try {
      const canUsePreferred = typeof MediaRecorder.isTypeSupported !== 'function'
        || MediaRecorder.isTypeSupported(PREFERRED_MIME_TYPE);
      // Fall back to the browser's default container when audio/webm is not supported.
      recorder = canUsePreferred
        ? new MediaRecorder(stream, { mimeType: PREFERRED_MIME_TYPE })
        : new MediaRecorder(stream);
      blobType = canUsePreferred ? PREFERRED_MIME_TYPE : (recorder.mimeType || PREFERRED_MIME_TYPE);

      startedAtMs = Date.now();
      const sessionChunks = [];
      chunks = sessionChunks;
      recorder.ondataavailable = (event) => {
        if (event?.data?.size > 0) sessionChunks.push(event.data);
      };
      recorder.start(250);

      timerId = window.setInterval(() => {
        if (typeof onTick === 'function') {
          onTick({
            elapsedMs: Date.now() - startedAtMs,
            level,
          });
        }
      }, 120);

      startLevelMeter();
    } catch (err) {
      // Never leave the microphone open after a failed start.
      cleanup();
      throw err;
    }
  }

  async function stop() {
    if (!recorder) return null;
    const active = recorder;
    const recorded = chunks;
    const type = blobType;
    try {
      // An already-inactive recorder never fires `stop`; do not wait for it.
      if (active.state !== 'inactive') {
        await new Promise((resolve) => {
          active.onstop = () => resolve();
          active.stop();
        });
      }
      return new Blob(recorded, { type });
    } finally {
      cleanup();
    }
  }

  function cancel() {
    try {
      if (recorder && recorder.state !== 'inactive') recorder.stop();
    } catch {
      // Best-effort: the recording is being discarded anyway.
    }
    cleanup();
  }

  return { start, stop, cancel };
}
|
||
|
||
/**
 * Plays an audio blob through an `Audio` element and resolves once playback
 * has started.
 *
 * The temporary object URL is revoked when playback ends or fails, and also
 * when `play()` itself rejects, so the blob is never leaked and a long clip is
 * never cut off by a fixed timer.
 *
 * @param {Blob} blob - Audio data to play.
 * @returns {Promise<void>}
 */
async function playAudioBlob(blob) {
  const audioUrl = URL.createObjectURL(blob);
  const release = () => URL.revokeObjectURL(audioUrl);
  try {
    const audio = new Audio(audioUrl);
    audio.addEventListener('ended', release, { once: true });
    audio.addEventListener('error', release, { once: true });
    await audio.play();
  } catch (err) {
    release();
    throw err;
  }
}

/**
 * Speaks the given text with the text-to-speech provider selected in settings.
 *
 * Providers (read from `entrySettings.tools.textToSpeech.provider`, default
 * `browser`):
 * - `browser` — Web Speech API with language `ru-RU`; `voice` selects a voice
 *   by name when one with that name exists.
 * - `piper-http` — POSTs JSON to `<piperBaseUrl>/api/tts` and plays the
 *   returned audio.
 * - `openai` — POSTs JSON to `<baseUrl>/audio/speech` and plays the returned
 *   mp3 audio.
 *
 * Blank text is ignored. For the HTTP providers the promise resolves once
 * playback has started, not when it finishes.
 *
 * @param {unknown} text - Text to speak.
 * @param {object | null | undefined} entrySettings - Settings object.
 * @returns {Promise<void>}
 * @throws {Error} When the provider is unknown or not configured, the browser
 *   has no speech synthesis, or the HTTP request fails.
 */
export async function speakTextBySettings(text, entrySettings) {
  const value = String(text || '').trim();
  if (!value) return;

  const cfg = entrySettings?.tools?.textToSpeech || {};
  const provider = String(cfg.provider || 'browser');

  if (provider === 'browser') {
    const synth = typeof window !== 'undefined' ? window.speechSynthesis : undefined;
    if (!synth || typeof SpeechSynthesisUtterance === 'undefined') {
      throw new Error('Синтез речи не поддерживается этим браузером.');
    }
    const utt = new SpeechSynthesisUtterance(value);
    utt.lang = 'ru-RU';
    const selected = String(cfg.voice || '').trim();
    if (selected) {
      const voice = synth.getVoices().find((v) => v.name === selected);
      if (voice) utt.voice = voice;
    }
    synth.speak(utt);
    return;
  }

  if (provider === 'piper-http') {
    const baseUrl = String(cfg.piperBaseUrl || '').trim().replace(/\/+$/, '');
    if (!baseUrl) throw new Error('Не указан адрес Piper HTTP.');
    const quality = String(cfg.quality || 'medium').toLowerCase();
    const voice = String(cfg.voice || '').trim();
    // Own-property check so a quality like "constructor" falls back to medium.
    const isKnownQuality = Object.prototype.hasOwnProperty.call(PIPER_LENGTH_SCALE_BY_QUALITY, quality);
    const lengthScale = (isKnownQuality && PIPER_LENGTH_SCALE_BY_QUALITY[quality])
      || PIPER_LENGTH_SCALE_BY_QUALITY.medium;
    const resp = await fetch(`${baseUrl}/api/tts`, {
      method: 'POST',
      headers: { 'Content-Type': 'application/json' },
      body: JSON.stringify({
        text: value,
        voice,
        quality,
        length_scale: lengthScale,
      }),
    });
    if (!resp.ok) throw new Error(`Piper HTTP недоступен (${resp.status}).`);
    await playAudioBlob(await resp.blob());
    return;
  }

  if (provider === 'openai') {
    const apiKey = String(cfg.apiKey || '').trim();
    if (!apiKey) throw new Error('Не заполнен API key для OpenAI TTS.');
    const model = String(cfg.model || '').trim() || 'gpt-4o-mini-tts';
    const baseUrl = normalizeOpenAiBaseUrl(cfg.externalBaseUrl || cfg.baseUrl);
    const voice = String(cfg.voice || '').trim() || 'alloy';
    const resp = await fetch(`${baseUrl}/audio/speech`, {
      method: 'POST',
      headers: {
        Authorization: `Bearer ${apiKey}`,
        'Content-Type': 'application/json',
      },
      body: JSON.stringify({
        model,
        voice,
        input: value,
        // The speech endpoint's parameter is `response_format`, not `format`.
        response_format: 'mp3',
      }),
    });
    if (!resp.ok) throw new Error(`OpenAI TTS недоступен (${resp.status}).`);
    await playAudioBlob(await resp.blob());
    return;
  }

  throw new Error('Неизвестный провайдер озвучки.');
}
|