diff --git a/android/app/build.gradle.kts b/android/app/build.gradle.kts index 0c81a39..539e668 100644 --- a/android/app/build.gradle.kts +++ b/android/app/build.gradle.kts @@ -36,7 +36,6 @@ android { } dependencies { - implementation("com.aallam.openai:openai-client:3.5.1") implementation("io.ktor:ktor-client-okhttp:2.3.6") implementation("androidx.core:core-ktx:1.9.0") implementation("androidx.appcompat:appcompat:1.6.1") diff --git a/android/app/src/main/AndroidManifest.xml b/android/app/src/main/AndroidManifest.xml index 0924a66..a97254c 100644 --- a/android/app/src/main/AndroidManifest.xml +++ b/android/app/src/main/AndroidManifest.xml @@ -12,6 +12,7 @@ android:roundIcon="@mipmap/ic_launcher_round" android:supportsRtl="true" android:theme="@style/Theme.WhisperToInput" + android:usesCleartextTraffic="true" tools:targetApi="31"> by preferencesDataStore(name = "settings") +val ENDPOINT = stringPreferencesKey("endpoint") +val LANGUAGE_CODE = stringPreferencesKey("language-code") +val REQUEST_STYLE = booleanPreferencesKey("is-openai-api-request-style") val API_KEY = stringPreferencesKey("api-key") class MainActivity : AppCompatActivity() { override fun onCreate(savedInstanceState: Bundle?) { super.onCreate(savedInstanceState) setContentView(R.layout.activity_main) - setupApiKeyWidgets(this) + setupConfigWidgets(this) checkPermissions() } @@ -54,53 +58,120 @@ class MainActivity : AppCompatActivity() { startActivity(intent) } - // Sets up API Key-related widgets. - private fun setupApiKeyWidgets(context: Context) { + // Sets up config widgets. + private fun setupConfigWidgets(context: Context) { + // TODO: Refactor. Perhaps use a class to process configuration UI widgets and behaviors. // Launches a non-blocking job in the main thread. // Perform data retrieval in the IO thread. 
+ val endpointInput: EditText = findViewById(R.id.edittext_endpoint) + val btnSetEndpoint: Button = findViewById(R.id.btn_set_endpoint) + val languageCodeInput: EditText = findViewById(R.id.edittext_language_code) + val btnSetLanguageCode: Button = findViewById(R.id.btn_set_language_code) val apiKeyInput: EditText = findViewById(R.id.edittext_api_key) val btnSetApiKey: Button = findViewById(R.id.btn_set_api_key) + val requestStyleOption : RadioGroup = findViewById(R.id.radio_request_style) CoroutineScope(Dispatchers.Main).launch { - // Disable input & button, and show loading hint - apiKeyInput.isEnabled = false + // Disable inputs, buttons & controls, and show loading hint + endpointInput.isEnabled = false + endpointInput.hint = getString(R.string.loading) + btnSetEndpoint.isEnabled = false + languageCodeInput.isEnabled = false + languageCodeInput.hint = getString(R.string.loading) + btnSetLanguageCode.isEnabled = false apiKeyInput.hint = getString(R.string.loading) btnSetApiKey.isEnabled = false + requestStyleOption.isEnabled = false + + // Retrieve stored endpoint, language code, api key & request style + val retrievedEndpoint = withContext(Dispatchers.IO) { + return@withContext dataStore.data.map { preferences -> + preferences[ENDPOINT] + }.first() + } + + val retrievedLanguageCode = withContext(Dispatchers.IO) { + return@withContext dataStore.data.map { preferences -> + preferences[LANGUAGE_CODE] + }.first() + } + + val retrievedRequestStyle = withContext(Dispatchers.IO) { + return@withContext dataStore.data.map { preferences -> + preferences[REQUEST_STYLE] + }.first() + } - // Retrieve Api Key val retrievedApiKey = withContext(Dispatchers.IO) { return@withContext dataStore.data.map { preferences -> preferences[API_KEY] }.first() } - // Set retrieved api key in input, or set "Enter API Key" hint + // Set retrieved endpoint in input, or set hint + if (retrievedEndpoint.isNullOrEmpty()) { + endpointInput.hint = getString(R.string.endpoint_hint) + } else { + 
endpointInput.setText(retrievedEndpoint) + } + + // Set retrieved language code in input, or set hint + // TODO: This could be a dropdown list? Or radio group? + if (retrievedLanguageCode.isNullOrEmpty()) { + languageCodeInput.hint = getString(R.string.language_code_hint) + } else { + languageCodeInput.setText(retrievedLanguageCode) + } + + // Set retrieved request style, or assign a default + if (retrievedRequestStyle == null) { + dataStore.edit { settings -> + settings[REQUEST_STYLE] = true + } + requestStyleOption.check(R.id.radio_btn_openai_api) + } else if (retrievedRequestStyle) { + requestStyleOption.check(R.id.radio_btn_openai_api) + } else { + requestStyleOption.check(R.id.radio_btn_whisper_webservice) + } + + // Set retrieved api key if (retrievedApiKey.isNullOrEmpty()) { - apiKeyInput.hint = getString(R.string.enter_openai_api_key) + apiKeyInput.hint = getString(R.string.api_key_hint) } else { apiKeyInput.setText(retrievedApiKey) } // Re-enable input & button + endpointInput.isEnabled = true + btnSetEndpoint.isEnabled = true + languageCodeInput.isEnabled = true + btnSetLanguageCode.isEnabled = true apiKeyInput.isEnabled = true btnSetApiKey.isEnabled = true + requestStyleOption.isEnabled = true - // After retrieval is done, assign onClick event to the setApiKey button - btnSetApiKey.setOnClickListener { onSetApiKey(context, apiKeyInput.text.toString()) } + // After retrieval is done, assign onClick event to the set buttons + btnSetEndpoint.setOnClickListener { onSetConfig(context, ENDPOINT, endpointInput.text.toString()) } + btnSetLanguageCode.setOnClickListener { onSetConfig(context, LANGUAGE_CODE, languageCodeInput.text.toString()) } + requestStyleOption.setOnCheckedChangeListener { _, checkedId -> + onSetConfig(context, REQUEST_STYLE, (checkedId == R.id.radio_btn_openai_api)) + } + btnSetApiKey.setOnClickListener { onSetConfig(context, API_KEY, apiKeyInput.text.toString()) } } } - // The onClick event of the button set api key - private fun onSetApiKey(context: 
Context, newApiKey: String?) { + // The onClick event of set config buttons + private fun <T> onSetConfig(context: Context, key: Preferences.Key<T>, newValue: T) { CoroutineScope(Dispatchers.Main).launch { withContext(Dispatchers.IO) { dataStore.edit { settings -> - settings[API_KEY] = newApiKey ?: "" + settings[key] = newValue } } - Toast.makeText(context, getString(R.string.api_key_successfully_set), Toast.LENGTH_SHORT).show() + Toast.makeText(context, getString(R.string.successfully_set), Toast.LENGTH_SHORT).show() } } diff --git a/android/app/src/main/java/com/example/whispertoinput/WhisperInputService.kt b/android/app/src/main/java/com/example/whispertoinput/WhisperInputService.kt index 71645af..78ae599 100644 --- a/android/app/src/main/java/com/example/whispertoinput/WhisperInputService.kt +++ b/android/app/src/main/java/com/example/whispertoinput/WhisperInputService.kt @@ -14,7 +14,7 @@ import android.content.pm.PackageManager import android.widget.Toast private const val RECORDED_AUDIO_FILENAME = "recorded.m4a" - +private const val AUDIO_MEDIA_TYPE = "audio/mp4" class WhisperInputService : InputMethodService() { private var whisperKeyboard: WhisperKeyboard = WhisperKeyboard() private var whisperJobManager: WhisperTranscriber = WhisperTranscriber() @@ -23,7 +23,7 @@ class WhisperInputService : InputMethodService() { private fun transcriptionCallback(text: String?) 
{ if (!text.isNullOrEmpty()) { - currentInputConnection?.commitText(text, text.length) + currentInputConnection?.commitText(text, 1) } whisperKeyboard.reset() @@ -67,6 +67,7 @@ class WhisperInputService : InputMethodService() { whisperJobManager.startAsync( this, recordedAudioFilename, + AUDIO_MEDIA_TYPE, { transcriptionCallback(it) }, { transcriptionExceptionCallback(it) } ) diff --git a/android/app/src/main/java/com/example/whispertoinput/WhisperTranscriber.kt b/android/app/src/main/java/com/example/whispertoinput/WhisperTranscriber.kt index 5316d61..ca1e667 100644 --- a/android/app/src/main/java/com/example/whispertoinput/WhisperTranscriber.kt +++ b/android/app/src/main/java/com/example/whispertoinput/WhisperTranscriber.kt @@ -1,44 +1,63 @@ package com.example.whispertoinput import android.content.Context -import android.util.Log -import com.aallam.openai.api.audio.TranscriptionRequest -import com.aallam.openai.api.file.FileSource -import com.aallam.openai.api.model.ModelId -import com.aallam.openai.client.OpenAI +import androidx.datastore.preferences.core.Preferences import kotlinx.coroutines.* import kotlinx.coroutines.flow.first import kotlinx.coroutines.flow.map -import okio.FileSystem -import okio.Path.Companion.toPath +import okhttp3.Headers +import okhttp3.MediaType.Companion.toMediaTypeOrNull +import okhttp3.MultipartBody +import okhttp3.OkHttpClient +import okhttp3.Request +import okhttp3.RequestBody +import okhttp3.RequestBody.Companion.asRequestBody +import java.io.File class WhisperTranscriber { + private data class Config( + val endpoint: String, + val languageCode: String, + val isRequestStyleOpenaiApi: Boolean, + val apiKey: String + ) + private var currentTranscriptionJob: Job? = null fun startAsync( context: Context, filename: String, + mediaType: String, callback: (String?) 
-> Unit, exceptionCallback: (String) -> Unit ) { suspend fun makeWhisperRequest(): String { - val apiKey = context.dataStore.data.map { preferences -> - preferences[API_KEY] + // Retrieve configs + val (endpoint, languageCode, isRequestStyleOpenaiApi, apiKey) = context.dataStore.data.map { preferences: Preferences -> + Config( + preferences[ENDPOINT] ?: "", + preferences[LANGUAGE_CODE] ?: "en", + preferences[REQUEST_STYLE] ?: true, + preferences[API_KEY] ?: "" + ) }.first() - val openai = OpenAI( - token = apiKey ?: "", - ) - val request = TranscriptionRequest( - audio = FileSource( - name = filename, - source = FileSystem.SYSTEM.source(filename.toPath()) - ), - model = ModelId("whisper-1"), - language = "zh" + + // Make request + val client = OkHttpClient() + val request = buildWhisperRequest( + filename, + "$endpoint?encode=true&task=transcribe&language=$languageCode&word_timestamps=false&output=txt", + mediaType, + apiKey, + isRequestStyleOpenaiApi ) - val transcription = openai.transcription(request) + val response = client.newCall(request).execute() - return transcription.text + // If request is not successful, or response code is weird + if (!response.isSuccessful || response.code / 100 != 2) { + throw Exception(response.body!!.string().replace('\n', ' ')) + } + return response.body!!.string() } // Create a cancellable job in the main thread (for UI updating) @@ -79,4 +98,42 @@ class WhisperTranscriber { currentTranscriptionJob?.cancel() currentTranscriptionJob = job } + + private fun buildWhisperRequest( + filename: String, + url: String, + mediaType: String, + apiKey: String, + isRequestStyleOpenaiApi: Boolean + ): Request { + // Please refer to the following for the endpoint/payload definitions: + // - https://ahmetoner.com/whisper-asr-webservice/run/#usage + // - https://platform.openai.com/docs/api-reference/audio/createTranscription + // - https://platform.openai.com/docs/api-reference/making-requests + val file: File = File(filename) + val fileBody: 
RequestBody = file.asRequestBody(mediaType.toMediaTypeOrNull()) + val requestBody: RequestBody = MultipartBody.Builder().apply { + setType(MultipartBody.FORM) + addFormDataPart("audio_file", "@audio.m4a", fileBody) + + if (isRequestStyleOpenaiApi) { + addFormDataPart("file", "@audio.m4a", fileBody) + addFormDataPart("model", "whisper-1") + addFormDataPart("response_format", "text") + } + }.build() + + val requestHeaders: Headers = Headers.Builder().apply { + if (isRequestStyleOpenaiApi) { + add("Authorization", "Bearer $apiKey") + } + add("Content-Type", "multipart/form-data") + }.build() + + return Request.Builder() + .headers(requestHeaders) + .url(url) + .post(requestBody) + .build() + } } diff --git a/android/app/src/main/res/layout/activity_main.xml b/android/app/src/main/res/layout/activity_main.xml index 6527c4b..5b33753 100644 --- a/android/app/src/main/res/layout/activity_main.xml +++ b/android/app/src/main/res/layout/activity_main.xml @@ -7,75 +7,204 @@ android:layout_height="match_parent" tools:context=".MainActivity"> - - -