API 키 및 음성 인식 모델 설정 추가: Hugging Face API 키와 모델 제공자 설정을 config.ini에 추가하고, main.py에서 관련 설정을 반영하였습니다. 음성 인식 기능에 Hugging Face와 VOSK 모델 지원을 추가하였습니다.

2025-05-02 23:54:37 +09:00 · 2025-05-02 23:54:37 +09:00 · bdbd7cb6f1
parent 6f4309a616
commit bdbd7cb6f1
6 changed files with 618 additions and 153 deletions
--- a/config.ini
+++ b/config.ini
@ -1,5 +1,6 @@
 [api]
 openai_api_key = your_openai_api_key_here
+huggingface_api_key = your_huggingface_api_key_here

 [audio]
 sample_rate = 16000
@ -10,3 +11,7 @@ buffer_duration = 3
 [app]
 theme = light

+[model]
+provider = huggingface
+name = kresnik/wav2vec2-large-xlsr-korean
+
--- a/conversation_logs/conversation_20250502211201.txt
+++ b/conversation_logs/conversation_20250502211201.txt
@ -0,0 +1,6 @@
+대화 ID: 20250502211201
+시작 시간: 2025-05-02 21:12:01
+
+--- 대화 내용 ---
+
+[21:12:01] 알 수 없음: EE
--- a/main.py
+++ b/main.py
@ -23,7 +23,8 @@ def main(page: ft.Page):
    # 기본 설정 생성
    if not os.path.exists("config.ini"):
        config["api"] = {
-            "openai_api_key": "your_openai_api_key_here"
+            "openai_api_key": "your_openai_api_key_here",
+            "huggingface_api_key": "your_huggingface_api_key_here"
        }
        config["audio"] = {
            "sample_rate": "16000",
@ -34,6 +35,10 @@ def main(page: ft.Page):
        config["app"] = {
            "theme": "light"
        }
+        config["model"] = {
+            "provider": "huggingface",
+            "name": "facebook/wav2vec2-base-960h"
+        }
        
        with open("config.ini", "w") as configfile:
            config.write(configfile)
@ -178,36 +183,47 @@ def main(page: ft.Page):
            status_indicator.set_detecting(True)
            page.update()
            
-            # MP3 또는 WAV 파일 처리
-            if file_path.lower().endswith('.mp3'):
-                # MP3를 WAV로 변환 (librosa 사용)
-                import librosa
-                
-                # MP3 파일 로드
-                audio_data, sample_rate = librosa.load(file_path, sr=16000, mono=True)
-                
-                # WAV 파일로 변환
-                with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
-                    temp_wav_path = temp_file.name
+            # MP3 또는 WAV 파일 직접 처리
+            text = speech_recognizer.recognize_file(file_path)
+            
+            # 변환 실패 시 MP3 파일인 경우 추가 처리
+            if text is None and file_path.lower().endswith('.mp3'):
+                try:
+                    # MP3를 WAV로 변환 (librosa 사용)
+                    import librosa
                    
-                    # 16비트 정수로 변환
-                    audio_data_int = (audio_data * 32767).astype(np.int16)
+                    # MP3 파일 로드
+                    audio_data, sample_rate = librosa.load(file_path, sr=16000, mono=True)
                    
-                    # WAV 파일 저장 - 쓰기 모드('wb')로 열기
-                    with wave.open(temp_wav_path, 'wb') as wf:
-                        wf.setnchannels(1)
-                        wf.setsampwidth(2)  # 16-bit
-                        wf.setframerate(16000)
-                        wf.writeframes(audio_data_int.tobytes())
+                    # WAV 파일로 변환
+                    fd, temp_wav_path = tempfile.mkstemp(suffix=".wav")
+                    os.close(fd)  # 파일 디스크립터 즉시 닫기
                    
-                    # OpenAI의 API를 사용해 음성 인식
-                    text = speech_recognizer.recognize_file(temp_wav_path)
-                    
-                    # 임시 파일 삭제
-                    os.unlink(temp_wav_path)
-            else:
-                # WAV 파일 직접 처리
-                text = speech_recognizer.recognize_file(file_path)
+                    try:
+                        # 16비트 정수로 변환
+                        audio_data_int = (audio_data * 32767).astype(np.int16)
+                        
+                        # WAV 파일 저장 - Wave_write 객체 사용
+                        wf = wave.open(temp_wav_path, 'wb')
+                        try:
+                            wf.setnchannels(1)
+                            wf.setsampwidth(2)  # 16-bit
+                            wf.setframerate(16000)
+                            wf.writeframes(audio_data_int.tobytes())
+                        finally:
+                            wf.close()  # 명시적으로 닫아줌
+                        
+                        # OpenAI의 API를 사용해 음성 인식
+                        text = speech_recognizer.recognize_file(temp_wav_path)
+                    finally:
+                        # 임시 파일 항상 삭제
+                        if os.path.exists(temp_wav_path):
+                            try:
+                                os.unlink(temp_wav_path)
+                            except Exception as e:
+                                print(f"임시 파일 삭제 오류: {e}")
+                except Exception as mp3_error:
+                    print(f"MP3 처리 오류: {mp3_error}")
            
            # 인식 결과 처리
            if text:
@ -346,13 +362,49 @@ def main(page: ft.Page):
        """설정 대화상자 표시"""
        try:
            # API 키 입력 필드
-            api_key_field = ft.TextField(
+            openai_api_key_field = ft.TextField(
                label="OpenAI API 키",
                value=speech_recognizer.api_key,
                password=True,
                width=400
            )
            
+            # Hugging Face API 키 입력 필드
+            hf_api_key_field = ft.TextField(
+                label="Hugging Face API 키",
+                value=speech_recognizer.hf_api_key,
+                password=True,
+                width=400
+            )
+            
+            # 모델 선택 관련 설정
+            # 모델 제공자 선택
+            model_provider_radio = ft.RadioGroup(
+                content=ft.Column([
+                    ft.Radio(value="huggingface", label="Hugging Face (한국어 음성인식 권장)"),
+                    ft.Radio(value="openai", label="OpenAI Whisper (백업)"),
+                    ft.Radio(value="vosk", label="VOSK 오프라인 (완전 오프라인)"),
+                ]),
+                value=speech_recognizer.model_provider
+            )
+            
+            # 음성인식 모델 선택 드롭다운
+            model_options = [
+                ft.dropdown.Option("vosk-model-small-kr", "VOSK 모델 - 완전 오프라인 한국어 소형 모델"),
+                ft.dropdown.Option("kresnik/wav2vec2-large-xlsr-korean", "한국어 특화 음성인식 모델"),
+                ft.dropdown.Option("openai/whisper-small", "openai 한국어 특화 Whisper 소형 모델"),
+                ft.dropdown.Option("openai/whisper-medium", "openai 한국어 특화 Whisper 중형 모델"),
+                ft.dropdown.Option("openai/whisper-large-v3", "openai 한국어 특화 Whisper 대형 모델"),
+                ft.dropdown.Option("facebook/wav2vec2-base-960h", "영어 음성인식 기본 모델")
+            ]
+            
+            model_dropdown = ft.Dropdown(
+                label="음성인식 모델 선택",
+                width=400,
+                options=model_options,
+                value=speech_recognizer.model_name,
+            )
+            
            # 오디오 설정 슬라이더
            silence_threshold_slider = ft.Slider(
                min=0.01,
@ -413,8 +465,19 @@ def main(page: ft.Page):
            def save_settings(e):
                try:
                    # API 키 저장
-                    if api_key_field.value:
-                        speech_recognizer.set_api_key(api_key_field.value)
+                    if openai_api_key_field.value:
+                        speech_recognizer.set_api_key(openai_api_key_field.value)
+                    
+                    # Hugging Face API 키 저장    
+                    if hf_api_key_field.value:
+                        speech_recognizer.set_huggingface_api_key(hf_api_key_field.value)
+                    
+                    # 모델 설정 저장
+                    if model_provider_radio.value and model_dropdown.value:
+                        speech_recognizer.set_model(
+                            model_provider_radio.value,
+                            model_dropdown.value
+                        )
                        
                    # 오디오 설정 저장
                    if hasattr(audio_source, 'update_settings'):
@ -460,9 +523,21 @@ def main(page: ft.Page):
            dialog = ft.AlertDialog(
                title=ft.Text("설정"),
                content=ft.Column([
-                    ft.Text("OpenAI API 설정", weight=ft.FontWeight.BOLD),
-                    api_key_field,
-                    ft.Divider(),
+                    ft.Text("API 설정", weight=ft.FontWeight.BOLD),
+                    ft.Text("OpenAI API (백업용)", size=14),
+                    openai_api_key_field,
+                    ft.Divider(height=10),
+                    ft.Text("Hugging Face API (권장)", size=14),
+                    hf_api_key_field,
+                    
+                    ft.Divider(height=20),
+                    ft.Text("음성인식 모델 설정", weight=ft.FontWeight.BOLD),
+                    ft.Text("API 제공자 선택", size=14),
+                    model_provider_radio,
+                    ft.Text("음성인식 모델 선택", size=14),
+                    model_dropdown,
+                    
+                    ft.Divider(height=20),
                    ft.Text("오디오 설정", weight=ft.FontWeight.BOLD),
                    silence_threshold_text,
                    silence_threshold_slider,
@ -470,11 +545,12 @@ def main(page: ft.Page):
                    silence_duration_slider,
                    buffer_duration_text,
                    buffer_duration_slider,
-                    ft.Divider(),
+                    
+                    ft.Divider(height=20),
                    ft.Text("앱 설정", weight=ft.FontWeight.BOLD),
-                    ft.Text("테마"),
+                    ft.Text("테마", size=14),
                    theme_radio,
-                ], scroll=ft.ScrollMode.AUTO, height=500),
+                ], scroll=ft.ScrollMode.AUTO, height=600),
                actions=[
                    ft.TextButton("취소", on_click=lambda e: setattr(dialog, "open", False)),
                    ft.TextButton("저장", on_click=save_settings),
--- a/modules/speech_recognition.py
+++ b/modules/speech_recognition.py
@ -8,6 +8,9 @@ import struct
 import threading
 import queue
 import time
+import requests
+import json
+from vosk import Model, KaldiRecognizer
 from typing import Optional, Callable

 class SpeechRecognizer:
@ -17,11 +20,30 @@ class SpeechRecognizer:
        if os.path.exists(config_path):
            self.config.read(config_path)
            
+        # OpenAI API 키
        self.api_key = self.config.get("api", "openai_api_key", fallback="")
        
-        if not self.api_key or self.api_key == "your_openai_api_key_here":
-            print("경고: OpenAI API 키가 설정되지 않았습니다. config.ini 파일을 확인하세요.")
+        # Hugging Face API 키
+        self.hf_api_key = self.config.get("api", "huggingface_api_key", fallback="")
        
+        # 모델 선택
+        self.model_provider = self.config.get("model", "provider", fallback="huggingface")
+        self.model_name = self.config.get("model", "name", fallback="facebook/wav2vec2-base-960h")
+        
+        # 한국어 모델 (기본값으로 사용 가능한 한국어 음성인식 모델들)
+        self.korean_models = [
+            "kresnik/wav2vec2-large-xlsr-korean",  # 한국어 특화 모델
+            "openai/whisper-small",              # 한국어 특화 Whisper 모델
+            "openai/whisper-medium",             # 더 큰 한국어 특화 Whisper 모델
+            "openai/whisper-large-v3",     # 다국어 모델 (한국어 포함)
+            "facebook/wav2vec2-base-960h"          # 영어 기본 모델 (참고용)
+        ]
+        
+        # API 키가 설정되지 않았을 때 경고
+        if not self.api_key and not self.hf_api_key:
+            print("경고: API 키가 설정되지 않았습니다. config.ini 파일에 OpenAI API 키 또는 Hugging Face API 키를 설정하세요.")
+        
+        # OpenAI 설정 (백업용)
        openai.api_key = self.api_key
        
        # 실시간 처리를 위한 설정
@ -30,83 +52,294 @@ class SpeechRecognizer:
        self.process_thread = None
        self.callback = None
        
+        # VOSK 모델 오프라인 초기화
+        if self.model_provider.lower() == "vosk":
+            try:
+                self.vosk_model = Model(self.model_name)
+            except Exception as e:
+                print(f"VOSK 모델 로드 오류: {e}")
+                self.vosk_model = None
+        else:
+            self.vosk_model = None
+        
    def recognize(self, audio_data: np.ndarray) -> Optional[str]:
        """오디오 데이터를 텍스트로 변환합니다."""
        if len(audio_data) == 0:
            return None
            
+        temp_file_path = None
        try:
-            # 임시 파일로 오디오 저장
-            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as temp_file:
-                # WAV 파일 형식으로 저장
-                with wave.open(temp_file.name, 'wb') as wf:
-                    wf.setnchannels(1)
-                    wf.setsampwidth(2)  # 16-bit
-                    wf.setframerate(16000)
+            # 임시 파일 생성
+            fd, temp_file_path = tempfile.mkstemp(suffix=".wav")
+            os.close(fd)  # 파일 디스크립터 즉시 닫기
+            
+            # WAV 파일 작성 (wave 모듈 사용)
+            # 쓰기 모드('wb')로 열었으므로 Wave_write 객체가 반환됩니다
+            wf = wave.open(temp_file_path, 'wb')
+            try:
+                wf.setnchannels(1)  # 모노 채널
+                wf.setsampwidth(2)  # 16-bit
+                wf.setframerate(16000)  # 샘플레이트
                    
-                    # float32 데이터를 int16으로 변환
-                    audio_data_int = (audio_data * 32767).astype(np.int16)
-                    wf.writeframes(audio_data_int.tobytes())
+                # float32 데이터를 int16으로 변환
+                audio_data_int = (audio_data * 32767).astype(np.int16)
+                wf.writeframes(audio_data_int.tobytes())
+            finally:
+                wf.close()  # 명시적으로 닫아줌
+            
+            if self.model_provider.lower() == "vosk":
+                # VOSK 오프라인 모델 사용
+                if self.vosk_model is None:
+                    print("VOSK 모델이 로드되지 않았습니다.")
+                    return None
+                return self._recognize_with_vosk(temp_file_path)
+            elif self.model_provider.lower() == "huggingface" and self.hf_api_key:
+                # Hugging Face 모델을 사용하여 음성 인식
+                return self._recognize_with_huggingface(temp_file_path)
+            elif self.api_key:
+                # OpenAI Whisper API를 백업으로 사용
+                return self._recognize_with_openai(temp_file_path)
+            else:
+                print("사용 가능한 API 키가 없습니다.")
+                return None
                
-                # OpenAI Whisper API를 사용하여 음성 인식
-                with open(temp_file.name, "rb") as audio_file:
-                    try:
-                        if not self.api_key:
-                            print("API 키가 설정되지 않았습니다.")
-                            return None
-                            
-                        # 최신 OpenAI API 사용 방식
-                        try:
-                            client = openai.OpenAI(api_key=self.api_key)
-                            result = client.audio.transcriptions.create(
-                                model="whisper-1",
-                                file=audio_file,
-                                language="ko",
-                                response_format="text"
-                            )
-                            text_result = result if isinstance(result, str) else result.text
-                        except (AttributeError, ImportError, NameError):
-                            # 이전 버전 OpenAI 라이브러리 지원 (최대한 호환성 유지)
-                            try:
-                                # audio 모듈 사용 (최신 버전)
-                                result = openai.audio.transcriptions.create(
-                                    model="whisper-1",
-                                    file=audio_file,
-                                    language="ko",
-                                    response_format="text"
-                                )
-                                text_result = result if isinstance(result, str) else result.text
-                            except (AttributeError, ImportError, NameError):
-                                # Audio 클래스 사용 (이전 버전)
-                                try:
-                                    result = openai.Audio.transcribe(
-                                        model="whisper-1",
-                                        file=audio_file,
-                                        language="ko",
-                                        response_format="text"
-                                    )
-                                    text_result = result if isinstance(result, str) else result.get("text", "")
-                                except:
-                                    print("OpenAI API 호출 방식을 찾을 수 없습니다.")
-                                    return None
-                        
-                        # 임시 파일 삭제
-                        os.unlink(temp_file.name)
-                        
-                        return text_result
-                        
-                    except Exception as e:
-                        print(f"API 호출 오류: {e}")
-                        # 임시 파일 삭제 시도
-                        try:
-                            os.unlink(temp_file.name)
-                        except:
-                            pass
-                        return None
-                        
        except Exception as e:
            print(f"음성 인식 오류: {e}")
            return None
+        finally:
+            # 임시 파일 삭제 시도 (try-finally로 보장)
+            if temp_file_path and os.path.exists(temp_file_path):
+                try:
+                    os.unlink(temp_file_path)
+                except Exception as e:
+                    print(f"임시 파일 삭제 오류: {e}")
+    
+    def _recognize_with_huggingface(self, file_path: str) -> Optional[str]:
+        """Transcribe an audio file through the Hugging Face Inference API.
+
+        The configured model (``self.model_name``) is tried first. If it does
+        not produce a transcript, every other entry of ``self.korean_models``
+        is tried in list order.
+
+        Args:
+            file_path: Path of the audio file to upload. The Content-Type
+                header is chosen from the extension (.mp3, .flac, otherwise
+                WAV).
+
+        Returns:
+            The transcript from the first model that answers with a
+            recognizable payload, or None when every model fails or an
+            unexpected error (for example an unreadable file) occurs.
+        """
+        try:
+            model_to_use = self.model_name
+            max_retries = 5
+
+            # Pick the Content-Type from the file extension (WAV by default).
+            lowered_path = file_path.lower()
+            if lowered_path.endswith(".mp3"):
+                content_type = "audio/mpeg"
+            elif lowered_path.endswith(".flac"):
+                content_type = "audio/flac"
+            else:
+                content_type = "audio/wav"
+
+            file_size = os.path.getsize(file_path)
+            print(f"[INFO] 파일 크기: {file_size} bytes, 파일 형식: {content_type}")
+
+            # Read the payload once; it is reused for every request.
+            with open(file_path, "rb") as f:
+                audio_bytes = f.read()
+
+            headers = {
+                "Authorization": f"Bearer {self.hf_api_key}",
+                "Content-Type": content_type,
+            }
+
+            # First pass: the configured model.
+            print(f"[INFO] 모델 {model_to_use}로 인식 시도")
+            text = self._query_huggingface_model(
+                model_to_use, audio_bytes, headers, max_retries
+            )
+            if text is not None:
+                return text
+
+            print("설정된 모델로 모든 시도 실패")
+
+            # Second pass: every other known model, in list order.
+            for korean_model in self.korean_models:
+                if korean_model == model_to_use:
+                    continue  # already tried above
+
+                print(f"[INFO] 대체 모델 {korean_model}로 시도")
+                text = self._query_huggingface_model(
+                    korean_model, audio_bytes, headers, max_retries, is_fallback=True
+                )
+                if text is not None:
+                    return text
+
+            print("모든 모델 시도 실패")
+            return None
+
+        except Exception as e:
+            print(f"Hugging Face API 호출 오류: {e}")
+            return None
+
+    def _query_huggingface_model(self, model_name: str, audio_bytes: bytes,
+                                 headers: dict, max_retries: int,
+                                 is_fallback: bool = False) -> Optional[str]:
+        """Send the audio to one Hugging Face model, retrying with backoff.
+
+        Args:
+            model_name: Model identifier inserted into the inference URL.
+            audio_bytes: Raw audio payload sent as the request body.
+            headers: HTTP headers (authorization and content type).
+            max_retries: Maximum number of requests sent to this model.
+            is_fallback: Only selects the wording of the log messages.
+
+        Returns:
+            The transcript, or None when no attempt produced one.
+        """
+        api_url = f"https://api-inference.huggingface.co/models/{model_name}?wait_for_model=true"
+        request_label = "대체 모델 시도" if is_fallback else "요청"
+        warn_prefix = "대체 모델 " if is_fallback else ""
+
+        for attempt in range(1, max_retries + 1):
+            # Exponential backoff: 0s before the first request, then 1, 2, 4, 8...
+            backoff_time = 0 if attempt == 1 else 2 ** (attempt - 2)
+
+            print(f"[INFO] {request_label} {attempt}/{max_retries}: {model_name}" +
+                  (f" (Backoff: {backoff_time}초 대기 후)" if backoff_time > 0 else ""))
+
+            if backoff_time > 0:
+                time.sleep(backoff_time)
+
+            try:
+                # 120s timeout leaves room for the model to load server-side.
+                response = requests.post(api_url, headers=headers, data=audio_bytes, timeout=120)
+            except requests.exceptions.RequestException as e:
+                # No response object exists here, so move straight to the next
+                # attempt instead of inspecting a status code.
+                print(f"[WARN] {warn_prefix}요청 실패: {e}")
+                continue
+
+            status = response.status_code
+            print(f"응답 상태 코드: {status}")
+
+            if status != 200:
+                if status == 503:
+                    print(f"[WARN] 서버 과부하 (503 Service Unavailable)")
+                else:
+                    print(f"[WARN] HTTP 오류: {status}")
+                print(f"응답 내용: {response.content[:200]}")
+                # Client errors (bad token, unknown model, ...) will not change
+                # when the identical request is resent; 408 and 429 are the
+                # exceptions worth retrying.
+                if 400 <= status < 500 and status not in (408, 429):
+                    break
+                continue
+
+            try:
+                result = response.json()
+            except ValueError:
+                # ValueError covers every JSON decode error type, e.g. an
+                # empty body or an HTML error page.
+                print(f"[WARN] {warn_prefix}JSON 파싱 실패: HTTP {status}, 내용 길이={len(response.content)}")
+                print(f"응답 내용 일부: {response.content[:200]}")
+                continue
+
+            text = self._extract_huggingface_text(result)
+            if text is not None:
+                return text
+
+            print(f"알 수 없는 응답 구조: {result}")
+
+        return None
+
+    @staticmethod
+    def _extract_huggingface_text(result) -> Optional[str]:
+        """Pull the transcript out of a decoded Hugging Face JSON response.
+
+        Accepts either a dict with a "text" key or a non-empty list whose
+        first element is a dict with a "generated_text" or "text" key.
+        Returns None for any other shape.
+        """
+        if isinstance(result, dict) and "text" in result:
+            return result["text"]
+
+        if isinstance(result, list) and result:
+            first = result[0]
+            if isinstance(first, dict):
+                if "generated_text" in first:
+                    return first["generated_text"]
+                if "text" in first:
+                    return first["text"]
+
+        return None
+    
+    def _recognize_with_vosk(self, file_path: str) -> Optional[str]:
+        """Transcribe a WAV file with the loaded offline VOSK model.
+
+        Args:
+            file_path: Path of a file readable by the ``wave`` module.
+
+        Returns:
+            The recognized text with segments joined by single spaces, or
+            None when no model is loaded, the file cannot be opened, or
+            nothing was recognized.
+        """
+        # Check the model first so no file handle is opened needlessly.
+        if not self.vosk_model:
+            print("VOSK 모델이 로드되지 않았습니다.")
+            return None
+
+        try:
+            wf = wave.open(file_path, "rb")
+        except Exception as e:
+            print(f"VOSK 오디오 파일 열기 오류: {e}")
+            return None
+
+        segments = []
+        # The with-block closes the wave reader even if recognition raises.
+        with wf:
+            rec = KaldiRecognizer(self.vosk_model, wf.getframerate())
+            while True:
+                data = wf.readframes(4000)
+                if not data:
+                    break
+                if rec.AcceptWaveform(data):
+                    segments.append(json.loads(rec.Result()).get("text", ""))
+            segments.append(json.loads(rec.FinalResult()).get("text", ""))
+
+        # Join with spaces so the last word of one segment does not fuse with
+        # the first word of the next.
+        text = " ".join(segment for segment in segments if segment)
+        return text if text else None
+    
+    def _recognize_with_openai(self, file_path: str) -> Optional[str]:
+        """Transcribe an audio file with the OpenAI Whisper API (backup path).
+
+        Args:
+            file_path: Path of the audio file to upload.
+
+        Returns:
+            The transcript, or None when the file cannot be opened or the
+            API call fails. Both failures are printed, not raised.
+        """
+        try:
+            with open(file_path, "rb") as wav_stream:
+                try:
+                    transcription = openai.OpenAI(api_key=self.api_key).audio.transcriptions.create(
+                        model="whisper-1",
+                        file=wav_stream,
+                        language="ko",
+                        response_format="text",
+                    )
+                    # The result is either a plain string or an object
+                    # exposing the transcript as ``.text``.
+                    if isinstance(transcription, str):
+                        return transcription
+                    return transcription.text
+                except Exception as api_error:
+                    print(f"OpenAI API 호출 오류: {api_error}")
+                    return None
+        except Exception as e:
+            print(f"OpenAI API 파일 로드 오류: {e}")
+            return None
            
    def recognize_file(self, file_path: str) -> Optional[str]:
        """오디오 파일을 텍스트로 변환합니다."""
@ -115,54 +348,22 @@ class SpeechRecognizer:
            return None
            
        try:
-            # OpenAI Whisper API를 사용하여 음성 인식
-            with open(file_path, "rb") as audio_file:
-                try:
-                    if not self.api_key:
-                        print("API 키가 설정되지 않았습니다.")
-                        return None
-                        
-                    # 최신 OpenAI API 사용 방식
-                    try:
-                        client = openai.OpenAI(api_key=self.api_key)
-                        result = client.audio.transcriptions.create(
-                            model="whisper-1",
-                            file=audio_file,
-                            language="ko",
-                            response_format="text"
-                        )
-                        text_result = result if isinstance(result, str) else result.text
-                    except (AttributeError, ImportError, NameError):
-                        # 이전 버전 OpenAI 라이브러리 지원 (최대한 호환성 유지)
-                        try:
-                            # audio 모듈 사용 (최신 버전)
-                            result = openai.audio.transcriptions.create(
-                                model="whisper-1",
-                                file=audio_file,
-                                language="ko",
-                                response_format="text"
-                            )
-                            text_result = result if isinstance(result, str) else result.text
-                        except (AttributeError, ImportError, NameError):
-                            # Audio 클래스 사용 (이전 버전)
-                            try:
-                                result = openai.Audio.transcribe(
-                                    model="whisper-1",
-                                    file=audio_file,
-                                    language="ko",
-                                    response_format="text"
-                                )
-                                text_result = result if isinstance(result, str) else result.get("text", "")
-                            except:
-                                print("OpenAI API 호출 방식을 찾을 수 없습니다.")
-                                return None
-                    
-                    return text_result
-                    
-                except Exception as e:
-                    print(f"파일 API 호출 오류: {e}")
+            if self.model_provider.lower() == "vosk":
+                # VOSK 오프라인 모델 사용
+                if self.vosk_model is None:
+                    print("VOSK 모델이 로드되지 않았습니다.")
                    return None
-                    
+                return self._recognize_with_vosk(file_path)
+            elif self.model_provider.lower() == "huggingface" and self.hf_api_key:
+                # Hugging Face 모델을 사용하여 음성 인식
+                return self._recognize_with_huggingface(file_path)
+            elif self.api_key:
+                # OpenAI Whisper API를 백업으로 사용
+                return self._recognize_with_openai(file_path)
+            else:
+                print("사용 가능한 API 키가 없습니다.")
+                return None
+                        
        except Exception as e:
            print(f"파일 음성 인식 오류: {e}")
            return None
@ -230,5 +431,35 @@ class SpeechRecognizer:
        try:
            with open("config.ini", "w") as config_file:
                self.config.write(config_file)
+        except Exception as e:
+            print(f"설정 파일 저장 오류: {e}")
+            
+    def set_huggingface_api_key(self, api_key: str):
+        """Set the Hugging Face API key and persist it to config.ini.
+
+        Args:
+            api_key: Value stored on ``self.hf_api_key`` and written to the
+                ``huggingface_api_key`` option of the ``[api]`` section.
+        """
+        self.hf_api_key = api_key
+        
+        # Update the config.ini file.
+        # NOTE(review): unlike set_model, there is no has_section("api") check
+        # here, and this call sits outside the try below. Assuming self.config
+        # is a configparser.ConfigParser, a missing [api] section would raise
+        # NoSectionError to the caller -- confirm [api] always exists.
+        self.config.set("api", "huggingface_api_key", api_key)
+        try:
+            with open("config.ini", "w") as config_file:
+                self.config.write(config_file)
+        except Exception as e:
+            # A failed write is only printed; the in-memory key stays set.
+            print(f"설정 파일 저장 오류: {e}")
+            
+    def set_model(self, provider: str, model_name: str):
+        """Select the speech-recognition model and persist it to config.ini.
+
+        Args:
+            provider: Stored on ``self.model_provider`` and written to the
+                ``provider`` option of the ``[model]`` section. The file
+                recognition dispatch lower-cases it and compares it against
+                "vosk" and "huggingface".
+            model_name: Stored on ``self.model_name`` and written to the
+                ``name`` option of the ``[model]`` section.
+        """
+        # NOTE(review): only the two attributes are updated here; nothing in
+        # this method loads a VOSK model, and the dispatcher returns None when
+        # self.vosk_model is None -- confirm callers load it separately.
+        self.model_provider = provider
+        self.model_name = model_name
+        
+        # Update the config.ini file.
+        # Create the [model] section first so the set() calls have a target.
+        if not self.config.has_section("model"):
+            self.config.add_section("model")
+        
+        self.config.set("model", "provider", provider)
+        self.config.set("model", "name", model_name)
+        
+        try:
+            with open("config.ini", "w") as config_file:
+                self.config.write(config_file) 
        except Exception as e:
            print(f"설정 파일 저장 오류: {e}") 
--- a/poetry.lock
+++ b/poetry.lock
@ -115,6 +115,108 @@ files = [
 [package.dependencies]
 pycparser = "*"

+[[package]]
+name = "charset-normalizer"
+version = "3.4.2"
+description = "The Real First Universal Charset Detector. Open, modern and actively maintained alternative to Chardet."
+optional = false
+python-versions = ">=3.7"
+groups = ["main"]
+files = [
+    {file = "charset_normalizer-3.4.2-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:7c48ed483eb946e6c04ccbe02c6b4d1d48e51944b6db70f697e089c193404941"},
+    {file = "charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2d318c11350e10662026ad0eb71bb51c7812fc8590825304ae0bdd4ac283acd"},
+    {file = "charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:9cbfacf36cb0ec2897ce0ebc5d08ca44213af24265bd56eca54bee7923c48fd6"},
+    {file = "charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:18dd2e350387c87dabe711b86f83c9c78af772c748904d372ade190b5c7c9d4d"},
+    {file = "charset_normalizer-3.4.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:8075c35cd58273fee266c58c0c9b670947c19df5fb98e7b66710e04ad4e9ff86"},
+    {file = "charset_normalizer-3.4.2-cp310-cp310-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:5bf4545e3b962767e5c06fe1738f951f77d27967cb2caa64c28be7c4563e162c"},
+    {file = "charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:7a6ab32f7210554a96cd9e33abe3ddd86732beeafc7a28e9955cdf22ffadbab0"},
+    {file = "charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_i686.whl", hash = "sha256:b33de11b92e9f75a2b545d6e9b6f37e398d86c3e9e9653c4864eb7e89c5773ef"},
+    {file = "charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_ppc64le.whl", hash = "sha256:8755483f3c00d6c9a77f490c17e6ab0c8729e39e6390328e42521ef175380ae6"},
+    {file = "charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_s390x.whl", hash = "sha256:68a328e5f55ec37c57f19ebb1fdc56a248db2e3e9ad769919a58672958e8f366"},
+    {file = "charset_normalizer-3.4.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:21b2899062867b0e1fde9b724f8aecb1af14f2778d69aacd1a5a1853a597a5db"},
+    {file = "charset_normalizer-3.4.2-cp310-cp310-win32.whl", hash = "sha256:e8082b26888e2f8b36a042a58307d5b917ef2b1cacab921ad3323ef91901c71a"},
+    {file = "charset_normalizer-3.4.2-cp310-cp310-win_amd64.whl", hash = "sha256:f69a27e45c43520f5487f27627059b64aaf160415589230992cec34c5e18a509"},
+    {file = "charset_normalizer-3.4.2-cp311-cp311-macosx_10_9_universal2.whl", hash = "sha256:be1e352acbe3c78727a16a455126d9ff83ea2dfdcbc83148d2982305a04714c2"},
+    {file = "charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:aa88ca0b1932e93f2d961bf3addbb2db902198dca337d88c89e1559e066e7645"},
+    {file = "charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:d524ba3f1581b35c03cb42beebab4a13e6cdad7b36246bd22541fa585a56cccd"},
+    {file = "charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:28a1005facc94196e1fb3e82a3d442a9d9110b8434fc1ded7a24a2983c9888d8"},
+    {file = "charset_normalizer-3.4.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fdb20a30fe1175ecabed17cbf7812f7b804b8a315a25f24678bcdf120a90077f"},
+    {file = "charset_normalizer-3.4.2-cp311-cp311-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:0f5d9ed7f254402c9e7d35d2f5972c9bbea9040e99cd2861bd77dc68263277c7"},
+    {file = "charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:efd387a49825780ff861998cd959767800d54f8308936b21025326de4b5a42b9"},
+    {file = "charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_i686.whl", hash = "sha256:f0aa37f3c979cf2546b73e8222bbfa3dc07a641585340179d768068e3455e544"},
+    {file = "charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_ppc64le.whl", hash = "sha256:e70e990b2137b29dc5564715de1e12701815dacc1d056308e2b17e9095372a82"},
+    {file = "charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_s390x.whl", hash = "sha256:0c8c57f84ccfc871a48a47321cfa49ae1df56cd1d965a09abe84066f6853b9c0"},
+    {file = "charset_normalizer-3.4.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:6b66f92b17849b85cad91259efc341dce9c1af48e2173bf38a85c6329f1033e5"},
+    {file = "charset_normalizer-3.4.2-cp311-cp311-win32.whl", hash = "sha256:daac4765328a919a805fa5e2720f3e94767abd632ae410a9062dff5412bae65a"},
+    {file = "charset_normalizer-3.4.2-cp311-cp311-win_amd64.whl", hash = "sha256:e53efc7c7cee4c1e70661e2e112ca46a575f90ed9ae3fef200f2a25e954f4b28"},
+    {file = "charset_normalizer-3.4.2-cp312-cp312-macosx_10_13_universal2.whl", hash = "sha256:0c29de6a1a95f24b9a1aa7aefd27d2487263f00dfd55a77719b530788f75cff7"},
+    {file = "charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cddf7bd982eaa998934a91f69d182aec997c6c468898efe6679af88283b498d3"},
+    {file = "charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:fcbe676a55d7445b22c10967bceaaf0ee69407fbe0ece4d032b6eb8d4565982a"},
+    {file = "charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:d41c4d287cfc69060fa91cae9683eacffad989f1a10811995fa309df656ec214"},
+    {file = "charset_normalizer-3.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4e594135de17ab3866138f496755f302b72157d115086d100c3f19370839dd3a"},
+    {file = "charset_normalizer-3.4.2-cp312-cp312-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:cf713fe9a71ef6fd5adf7a79670135081cd4431c2943864757f0fa3a65b1fafd"},
+    {file = "charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:a370b3e078e418187da8c3674eddb9d983ec09445c99a3a263c2011993522981"},
+    {file = "charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_i686.whl", hash = "sha256:a955b438e62efdf7e0b7b52a64dc5c3396e2634baa62471768a64bc2adb73d5c"},
+    {file = "charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_ppc64le.whl", hash = "sha256:7222ffd5e4de8e57e03ce2cef95a4c43c98fcb72ad86909abdfc2c17d227fc1b"},
+    {file = "charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_s390x.whl", hash = "sha256:bee093bf902e1d8fc0ac143c88902c3dfc8941f7ea1d6a8dd2bcb786d33db03d"},
+    {file = "charset_normalizer-3.4.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:dedb8adb91d11846ee08bec4c8236c8549ac721c245678282dcb06b221aab59f"},
+    {file = "charset_normalizer-3.4.2-cp312-cp312-win32.whl", hash = "sha256:db4c7bf0e07fc3b7d89ac2a5880a6a8062056801b83ff56d8464b70f65482b6c"},
+    {file = "charset_normalizer-3.4.2-cp312-cp312-win_amd64.whl", hash = "sha256:5a9979887252a82fefd3d3ed2a8e3b937a7a809f65dcb1e068b090e165bbe99e"},
+    {file = "charset_normalizer-3.4.2-cp313-cp313-macosx_10_13_universal2.whl", hash = "sha256:926ca93accd5d36ccdabd803392ddc3e03e6d4cd1cf17deff3b989ab8e9dbcf0"},
+    {file = "charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:eba9904b0f38a143592d9fc0e19e2df0fa2e41c3c3745554761c5f6447eedabf"},
+    {file = "charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:3fddb7e2c84ac87ac3a947cb4e66d143ca5863ef48e4a5ecb83bd48619e4634e"},
+    {file = "charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:98f862da73774290f251b9df8d11161b6cf25b599a66baf087c1ffe340e9bfd1"},
+    {file = "charset_normalizer-3.4.2-cp313-cp313-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6c9379d65defcab82d07b2a9dfbfc2e95bc8fe0ebb1b176a3190230a3ef0e07c"},
+    {file = "charset_normalizer-3.4.2-cp313-cp313-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e635b87f01ebc977342e2697d05b56632f5f879a4f15955dfe8cef2448b51691"},
+    {file = "charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:1c95a1e2902a8b722868587c0e1184ad5c55631de5afc0eb96bc4b0d738092c0"},
+    {file = "charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_i686.whl", hash = "sha256:ef8de666d6179b009dce7bcb2ad4c4a779f113f12caf8dc77f0162c29d20490b"},
+    {file = "charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_ppc64le.whl", hash = "sha256:32fc0341d72e0f73f80acb0a2c94216bd704f4f0bce10aedea38f30502b271ff"},
+    {file = "charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_s390x.whl", hash = "sha256:289200a18fa698949d2b39c671c2cc7a24d44096784e76614899a7ccf2574b7b"},
+    {file = "charset_normalizer-3.4.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:4a476b06fbcf359ad25d34a057b7219281286ae2477cc5ff5e3f70a246971148"},
+    {file = "charset_normalizer-3.4.2-cp313-cp313-win32.whl", hash = "sha256:aaeeb6a479c7667fbe1099af9617c83aaca22182d6cf8c53966491a0f1b7ffb7"},
+    {file = "charset_normalizer-3.4.2-cp313-cp313-win_amd64.whl", hash = "sha256:aa6af9e7d59f9c12b33ae4e9450619cf2488e2bbe9b44030905877f0b2324980"},
+    {file = "charset_normalizer-3.4.2-cp37-cp37m-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1cad5f45b3146325bb38d6855642f6fd609c3f7cad4dbaf75549bf3b904d3184"},
+    {file = "charset_normalizer-3.4.2-cp37-cp37m-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:b2680962a4848b3c4f155dc2ee64505a9c57186d0d56b43123b17ca3de18f0fa"},
+    {file = "charset_normalizer-3.4.2-cp37-cp37m-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:36b31da18b8890a76ec181c3cf44326bf2c48e36d393ca1b72b3f484113ea344"},
+    {file = "charset_normalizer-3.4.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:f4074c5a429281bf056ddd4c5d3b740ebca4d43ffffe2ef4bf4d2d05114299da"},
+    {file = "charset_normalizer-3.4.2-cp37-cp37m-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:c9e36a97bee9b86ef9a1cf7bb96747eb7a15c2f22bdb5b516434b00f2a599f02"},
+    {file = "charset_normalizer-3.4.2-cp37-cp37m-musllinux_1_2_aarch64.whl", hash = "sha256:1b1bde144d98e446b056ef98e59c256e9294f6b74d7af6846bf5ffdafd687a7d"},
+    {file = "charset_normalizer-3.4.2-cp37-cp37m-musllinux_1_2_i686.whl", hash = "sha256:915f3849a011c1f593ab99092f3cecfcb4d65d8feb4a64cf1bf2d22074dc0ec4"},
+    {file = "charset_normalizer-3.4.2-cp37-cp37m-musllinux_1_2_ppc64le.whl", hash = "sha256:fb707f3e15060adf5b7ada797624a6c6e0138e2a26baa089df64c68ee98e040f"},
+    {file = "charset_normalizer-3.4.2-cp37-cp37m-musllinux_1_2_s390x.whl", hash = "sha256:25a23ea5c7edc53e0f29bae2c44fcb5a1aa10591aae107f2a2b2583a9c5cbc64"},
+    {file = "charset_normalizer-3.4.2-cp37-cp37m-musllinux_1_2_x86_64.whl", hash = "sha256:770cab594ecf99ae64c236bc9ee3439c3f46be49796e265ce0cc8bc17b10294f"},
+    {file = "charset_normalizer-3.4.2-cp37-cp37m-win32.whl", hash = "sha256:6a0289e4589e8bdfef02a80478f1dfcb14f0ab696b5a00e1f4b8a14a307a3c58"},
+    {file = "charset_normalizer-3.4.2-cp37-cp37m-win_amd64.whl", hash = "sha256:6fc1f5b51fa4cecaa18f2bd7a003f3dd039dd615cd69a2afd6d3b19aed6775f2"},
+    {file = "charset_normalizer-3.4.2-cp38-cp38-macosx_10_9_universal2.whl", hash = "sha256:76af085e67e56c8816c3ccf256ebd136def2ed9654525348cfa744b6802b69eb"},
+    {file = "charset_normalizer-3.4.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e45ba65510e2647721e35323d6ef54c7974959f6081b58d4ef5d87c60c84919a"},
+    {file = "charset_normalizer-3.4.2-cp38-cp38-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:046595208aae0120559a67693ecc65dd75d46f7bf687f159127046628178dc45"},
+    {file = "charset_normalizer-3.4.2-cp38-cp38-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:75d10d37a47afee94919c4fab4c22b9bc2a8bf7d4f46f87363bcf0573f3ff4f5"},
+    {file = "charset_normalizer-3.4.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:6333b3aa5a12c26b2a4d4e7335a28f1475e0e5e17d69d55141ee3cab736f66d1"},
+    {file = "charset_normalizer-3.4.2-cp38-cp38-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:e8323a9b031aa0393768b87f04b4164a40037fb2a3c11ac06a03ffecd3618027"},
+    {file = "charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_aarch64.whl", hash = "sha256:24498ba8ed6c2e0b56d4acbf83f2d989720a93b41d712ebd4f4979660db4417b"},
+    {file = "charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_i686.whl", hash = "sha256:844da2b5728b5ce0e32d863af26f32b5ce61bc4273a9c720a9f3aa9df73b1455"},
+    {file = "charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_ppc64le.whl", hash = "sha256:65c981bdbd3f57670af8b59777cbfae75364b483fa8a9f420f08094531d54a01"},
+    {file = "charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_s390x.whl", hash = "sha256:3c21d4fca343c805a52c0c78edc01e3477f6dd1ad7c47653241cf2a206d4fc58"},
+    {file = "charset_normalizer-3.4.2-cp38-cp38-musllinux_1_2_x86_64.whl", hash = "sha256:dc7039885fa1baf9be153a0626e337aa7ec8bf96b0128605fb0d77788ddc1681"},
+    {file = "charset_normalizer-3.4.2-cp38-cp38-win32.whl", hash = "sha256:8272b73e1c5603666618805fe821edba66892e2870058c94c53147602eab29c7"},
+    {file = "charset_normalizer-3.4.2-cp38-cp38-win_amd64.whl", hash = "sha256:70f7172939fdf8790425ba31915bfbe8335030f05b9913d7ae00a87d4395620a"},
+    {file = "charset_normalizer-3.4.2-cp39-cp39-macosx_10_9_universal2.whl", hash = "sha256:005fa3432484527f9732ebd315da8da8001593e2cf46a3d817669f062c3d9ed4"},
+    {file = "charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:e92fca20c46e9f5e1bb485887d074918b13543b1c2a1185e69bb8d17ab6236a7"},
+    {file = "charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:50bf98d5e563b83cc29471fa114366e6806bc06bc7a25fd59641e41445327836"},
+    {file = "charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:721c76e84fe669be19c5791da68232ca2e05ba5185575086e384352e2c309597"},
+    {file = "charset_normalizer-3.4.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:82d8fd25b7f4675d0c47cf95b594d4e7b158aca33b76aa63d07186e13c0e0ab7"},
+    {file = "charset_normalizer-3.4.2-cp39-cp39-manylinux_2_5_i686.manylinux1_i686.manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:b3daeac64d5b371dea99714f08ffc2c208522ec6b06fbc7866a450dd446f5c0f"},
+    {file = "charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:dccab8d5fa1ef9bfba0590ecf4d46df048d18ffe3eec01eeb73a42e0d9e7a8ba"},
+    {file = "charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_i686.whl", hash = "sha256:aaf27faa992bfee0264dc1f03f4c75e9fcdda66a519db6b957a3f826e285cf12"},
+    {file = "charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_ppc64le.whl", hash = "sha256:eb30abc20df9ab0814b5a2524f23d75dcf83cde762c161917a2b4b7b55b1e518"},
+    {file = "charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_s390x.whl", hash = "sha256:c72fbbe68c6f32f251bdc08b8611c7b3060612236e960ef848e0a517ddbe76c5"},
+    {file = "charset_normalizer-3.4.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:982bb1e8b4ffda883b3d0a521e23abcd6fd17418f6d2c4118d257a10199c0ce3"},
+    {file = "charset_normalizer-3.4.2-cp39-cp39-win32.whl", hash = "sha256:43e0933a0eff183ee85833f341ec567c0980dae57c464d8a508e1b2ceb336471"},
+    {file = "charset_normalizer-3.4.2-cp39-cp39-win_amd64.whl", hash = "sha256:d11b54acf878eef558599658b0ffca78138c8c3655cf4f3a4a673c437e67732e"},
+    {file = "charset_normalizer-3.4.2-py3-none-any.whl", hash = "sha256:7f56930ab0abd1c45cd15be65cc741c28b1c9a34876ce8c17a2fa107810c0af0"},
+    {file = "charset_normalizer-3.4.2.tar.gz", hash = "sha256:5baececa9ecba31eff645232d59845c07aa030f0c81ee70184a90d35099a0e63"},
+]
+
 [[package]]
 name = "colorama"
 version = "0.4.6"
@ -1050,6 +1152,28 @@ files = [
 [package.dependencies]
 six = ">=1.9.0"

+[[package]]
+name = "requests"
+version = "2.32.3"
+description = "Python HTTP for Humans."
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "requests-2.32.3-py3-none-any.whl", hash = "sha256:70761cfe03c773ceb22aa2f671b4757976145175cdfca038c02654d061d6dcc6"},
+    {file = "requests-2.32.3.tar.gz", hash = "sha256:55365417734eb18255590a9ff9eb97e9e1da868d4ccd6402399eaf68af20a760"},
+]
+
+[package.dependencies]
+certifi = ">=2017.4.17"
+charset-normalizer = ">=2,<4"
+idna = ">=2.5,<4"
+urllib3 = ">=1.21.1,<3"
+
+[package.extras]
+socks = ["PySocks (>=1.5.6,!=1.5.7)"]
+use-chardet-on-py3 = ["chardet (>=3.0.2,<6)"]
+
 [[package]]
 name = "six"
 version = "1.17.0"
@ -1129,6 +1253,24 @@ files = [
    {file = "typing_extensions-4.13.2.tar.gz", hash = "sha256:e6c81219bd689f51865d9e372991c540bda33a0379d5573cddb9a3a23f7caaef"},
 ]

+[[package]]
+name = "urllib3"
+version = "2.2.3"
+description = "HTTP library with thread-safe connection pooling, file post, and more."
+optional = false
+python-versions = ">=3.8"
+groups = ["main"]
+files = [
+    {file = "urllib3-2.2.3-py3-none-any.whl", hash = "sha256:ca899ca043dcb1bafa3e262d73aa25c465bfb49e0bd9dd5d59f1d0acba2f8fac"},
+    {file = "urllib3-2.2.3.tar.gz", hash = "sha256:e7d814a81dad81e6caf2ec9fdedb284ecc9c73076b62654547cc64ccdcae26e9"},
+]
+
+[package.extras]
+brotli = ["brotli (>=1.0.9) ; platform_python_implementation == \"CPython\"", "brotlicffi (>=0.8.0) ; platform_python_implementation != \"CPython\""]
+h2 = ["h2 (>=4,<5)"]
+socks = ["pysocks (>=1.5.6,!=1.5.7,<2.0)"]
+zstd = ["zstandard (>=0.18.0)"]
+
 [[package]]
 name = "watchdog"
 version = "3.0.0"
@ -1302,4 +1444,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.1"
 python-versions = "^3.8"
-content-hash = "45cc5a82edeb1d93670ba5ee2e56925e68d129abeca2481bf890f70a48ed5ed2"
+content-hash = "675cc981f2dac0ff7f944f56b41285406ddbbf696f36627c85e61dbd88cd8d83"
--- a/pyproject.toml
+++ b/pyproject.toml
@ -13,6 +13,11 @@ openai = "^1.0.0"
 numpy = "^1.22.0"
 sounddevice = "^0.4.6"
 pyaudio = "^0.2.13"
+requests = "^2.32.3"
+librosa = "^0.11.0"
+# NOTE(review): `wave` is also the name of a Python standard-library module;
+# this line pins a separate PyPI distribution -- confirm it is intended.
+wave = "^0.0.2"
+cffi = "^1.17.1"
+# NOTE(review): the lock metadata records python-versions = "^3.8"; verify
+# that scipy 1.15 supports that interpreter range -- TODO confirm.
+scipy = "^1.15.2"

 [build-system]
 requires = ["poetry-core"]