Skip to content

Commit

Permalink
feat:Adapt TongYi LLM speech transcription by Spring AI API (#3733)
Browse files Browse the repository at this point in the history
Signed-off-by: yuluo-yx <yuluo08290126@gmail.com>
Co-authored-by: yuluo-yx <yuluo08290126@gmail.com>
  • Loading branch information
n3A87 and yuluo-yx authored May 21, 2024
1 parent 4300772 commit c52d1cf
Show file tree
Hide file tree
Showing 36 changed files with 1,175 additions and 72 deletions.
1 change: 1 addition & 0 deletions 05-17-10-23-13.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
这是由阿里巴巴达摩院语音实验室提供的实时语音识别技术。
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,13 @@
import com.alibaba.cloud.ai.example.tongyi.models.ActorsFilms;
import com.alibaba.cloud.ai.example.tongyi.models.Completion;
import com.alibaba.cloud.ai.example.tongyi.service.TongYiService;
import com.alibaba.dashscope.audio.asr.transcription.TranscriptionParam;

import org.springframework.ai.chat.messages.AssistantMessage;
import org.springframework.ai.image.ImageResponse;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Qualifier;
import org.springframework.core.io.Resource;
import org.springframework.web.bind.annotation.CrossOrigin;
import org.springframework.web.bind.annotation.GetMapping;
import org.springframework.web.bind.annotation.RequestMapping;
Expand Down Expand Up @@ -138,6 +140,24 @@ public String genAudio(@RequestParam(value = "prompt",
return tongYiAudioService.genAudio(prompt);
}

// Service used by the transcription endpoint below; selected by bean name.
// NOTE(review): presumably resolves to TongYiAudioTranscriptionServiceImpl via its default bean name — confirm.
@Autowired
@Qualifier("tongYiAudioTranscriptionServiceImpl")
private TongYiService tongYiAudioTranscriptionService;

/**
 * Audio transcription endpoint. Passes the audio URL to the transcription service
 * and returns the service result unchanged.
 * {@link Resource}
 * {@link TranscriptionParam}
 * @param url URL of the audio resource, read from the {@code audioUrls} request
 * parameter; a sample recording is used when the parameter is absent.
 * @return the string returned by the transcription service. In
 * {@code TongYiAudioTranscriptionServiceImpl} this is the newline-separated list of
 * local paths of the saved transcript files.
 */
@GetMapping("/audio/transcription")
public String audioTranscription(@RequestParam(value = "audioUrls",
defaultValue = "https://dashscope.oss-cn-beijing.aliyuncs.com/samples/audio/paraformer/realtime_asr_example.wav") String url) {

return tongYiAudioTranscriptionService.audioTranscription(url);
}

@Autowired
@Qualifier("tongYiTextEmbeddingServiceImpl")
private TongYiService tongYiTextEmbeddingService;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,4 +98,11 @@ public List<Double> textEmbedding(String text) {
.getStackTrace()[2].getMethodName() + INFO_SUFFIX);
}

/**
 * Default implementation for services that do not provide audio transcription:
 * always throws.
 * @param url audio url (ignored).
 * @return never returns normally.
 * @throws RuntimeException always; the message embeds the method name found at
 * index 2 of the current thread's stack trace.
 */
@Override
public String audioTranscription(String url) {

	// Index 2 is the frame above this method (0 = getStackTrace, 1 = this method).
	final StackTraceElement[] frames = Thread.currentThread().getStackTrace();
	final String methodName = frames[2].getMethodName();

	throw new RuntimeException(INFO_PREFIX + methodName + INFO_SUFFIX);
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,13 @@ public interface TongYiService {
*/
String genAudio(String text);

/**
 * Audio transcription.
 * @param audioUrls URL of the audio file to be transcribed. Despite the plural
 * name, the value is a single string.
 * @return the transcription result as a string. In
 * {@code TongYiAudioTranscriptionServiceImpl} this is the newline-separated list of
 * paths of the saved transcript files.
 */
String audioTranscription(String audioUrls);

/**
* TongYI LLM Text embedding.
* @param text input text.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

package com.alibaba.cloud.ai.example.tongyi.service.impl.audio;
package com.alibaba.cloud.ai.example.tongyi.service.impl.audio.speech;

import java.io.File;
import java.io.FileOutputStream;
Expand All @@ -24,7 +24,7 @@

import com.alibaba.cloud.ai.example.tongyi.service.AbstractTongYiServiceImpl;
import com.alibaba.cloud.ai.example.tongyi.service.TongYiService;
import com.alibaba.cloud.ai.tongyi.audio.api.SpeechClient;
import com.alibaba.cloud.ai.tongyi.audio.speech.api.SpeechClient;
import com.alibaba.dashscope.audio.tts.SpeechSynthesisAudioFormat;
import lombok.extern.slf4j.Slf4j;
import org.slf4j.Logger;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Spring Cloud Alibaba AI Audio Transcription

`TongYiController` 接受一个 HTTP GET 请求 `http://localhost:8080/ai/audio/transcription`
`controller` 将会调用 `TongYiService` 中的 `audioTranscription` 方法,完成服务请求得到响应。

可通过 `audioUrls` 请求参数提供一个或多个需要进行语音识别的音视频文件的 URL。

## 构建和运行

1. 修改配置文件 `application.yml` 中的 apikey 为有效的 apikey;
2. 通过 IDE 或者 `./mvnw spring-boot:run` 运行应用程序。

## 访问接口

使用 curl 工具对接口发起请求:

```shell
$ curl -X GET "http://localhost:8080/ai/audio/transcription?audioUrls=url1&audioUrls=url2"

# Response:
D:\Code\spring-cloud-alibaba\05-13-20-47-08.txt
D:\Code\spring-cloud-alibaba\05-13-20-47-09.txt
```

返回参数为保存到当前根路径下的音频转录文本文件的路径。
Original file line number Diff line number Diff line change
@@ -0,0 +1,123 @@
/*
* Copyright 2023-2024 the original author or authors.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* https://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.alibaba.cloud.ai.example.tongyi.service.impl.audio.transcription;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import java.time.LocalDateTime;
import java.time.format.DateTimeFormatter;
import java.util.List;

import com.alibaba.cloud.ai.example.tongyi.service.AbstractTongYiServiceImpl;
import com.alibaba.cloud.ai.example.tongyi.service.TongYiService;
import com.alibaba.cloud.ai.tongyi.audio.transcription.TongYiAudioTranscriptionClient;
import com.alibaba.cloud.ai.tongyi.audio.transcription.api.AudioTranscriptionPrompt;
import com.alibaba.cloud.ai.tongyi.audio.transcription.api.AudioTranscriptionResult;
import com.google.gson.JsonArray;
import com.google.gson.JsonObject;
import com.google.gson.JsonParser;
import lombok.extern.slf4j.Slf4j;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.core.io.Resource;
import org.springframework.core.io.UrlResource;
import org.springframework.stereotype.Service;

/**
 * Example implementation of {@link TongYiService#audioTranscription(String)}.
 * Sends the audio resource to the TongYi transcription client, downloads each
 * result document, extracts the recognised text and stores it in a text file in
 * the current working directory.
 *
 * @author xYLiu
 * @since 2024/5/15 14:55
 */
@Slf4j
@Service
public class TongYiAudioTranscriptionServiceImpl extends AbstractTongYiServiceImpl {

	private static final Logger logger = LoggerFactory.getLogger(TongYiAudioTranscriptionServiceImpl.class);

	/** Pattern of the timestamp used as base name of every saved transcript file. */
	private static final DateTimeFormatter FILE_NAME_FORMATTER = DateTimeFormatter.ofPattern("MM-dd-HH-mm-ss");

	/** Upper bound for establishing the download connection, so a request cannot hang forever. */
	private static final int CONNECT_TIMEOUT_MILLIS = 10_000;

	/** Upper bound for a single blocking read while downloading a result document. */
	private static final int READ_TIMEOUT_MILLIS = 30_000;

	private final TongYiAudioTranscriptionClient audioTranscriptionClient;

	@Autowired
	public TongYiAudioTranscriptionServiceImpl(final TongYiAudioTranscriptionClient audioTranscriptionClient) {
		this.audioTranscriptionClient = audioTranscriptionClient;
	}

	/**
	 * Transcribes the audio resource at the given URL and saves the recognised text.
	 * @param audioUrls URL of the audio resource to transcribe.
	 * @return paths of the transcript files that were written, one per line; results
	 * whose download failed contribute no line.
	 * @throws RuntimeException if {@code audioUrls} cannot be turned into a URL resource.
	 */
	@Override
	public String audioTranscription(String audioUrls) {

		Resource resource;

		try {
			resource = new UrlResource(audioUrls);
		}
		catch (IOException e) {
			logger.error("Failed to create resource.");
			throw new RuntimeException(e);
		}
		AudioTranscriptionPrompt audioTranscriptionPrompt = new AudioTranscriptionPrompt(resource);

		return save(audioTranscriptionClient.call(audioTranscriptionPrompt).getResults());
	}

	/**
	 * Downloads every transcription result and writes its text to a new file in the
	 * directory named by the {@code user.dir} system property.
	 * @param resultList results whose output is the URL of a JSON result document.
	 * @return paths of the files actually written, each followed by a newline.
	 */
	private String save(List<AudioTranscriptionResult> resultList) {
		String currentPath = System.getProperty("user.dir");
		StringBuilder retPaths = new StringBuilder();
		for (AudioTranscriptionResult audioTranscriptionResult : resultList) {
			String baseName = LocalDateTime.now().format(FILE_NAME_FORMATTER);
			try {
				String json = download(audioTranscriptionResult.getOutput());
				String text = (json == null) ? null : extractText(json);
				if (text == null) {
					// Failure already logged; do not report a path for a file that was never written.
					continue;
				}
				File target = writeToNewFile(currentPath, baseName, text);
				logger.info("File downloaded successfully:{}\n", target.getPath());
				retPaths.append(target.getPath()).append("\n");
			}
			catch (IOException e) {
				logger.error("An error occurred during the file download process.", e);
			}
		}
		return retPaths.toString();
	}

	/**
	 * Fetches the document at the given URL with an HTTP GET.
	 * @param resultUrl URL of the result document.
	 * @return the response body decoded as UTF-8, or {@code null} when the response
	 * status is not 200.
	 * @throws IOException if the URL is malformed or the transfer fails.
	 */
	private static String download(String resultUrl) throws IOException {
		HttpURLConnection connection = (HttpURLConnection) new URL(resultUrl).openConnection();
		try {
			connection.setRequestMethod("GET");
			connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
			connection.setReadTimeout(READ_TIMEOUT_MILLIS);
			int responseCode = connection.getResponseCode();
			if (responseCode != HttpURLConnection.HTTP_OK) {
				logger.error("The download failed, and the response code:{}",
						responseCode);
				return null;
			}
			try (BufferedInputStream in = new BufferedInputStream(connection.getInputStream())) {
				// Decode the complete body at once: decoding fixed-size chunks separately
				// corrupts multi-byte characters that straddle a chunk boundary.
				return new String(in.readAllBytes(), StandardCharsets.UTF_8);
			}
		}
		finally {
			// Release the connection on every path, including failures.
			connection.disconnect();
		}
	}

	/**
	 * Concatenates the {@code text} member of every element of the {@code transcripts}
	 * array of the result document.
	 * @param json result document.
	 * @return the concatenated text, or {@code null} when the document has no
	 * {@code transcripts} array.
	 */
	private static String extractText(String json) {
		JsonObject rootObj = JsonParser.parseString(json).getAsJsonObject();
		JsonArray transcriptsArray = rootObj.getAsJsonArray("transcripts");
		if (transcriptsArray == null) {
			logger.error("The transcription result contains no transcripts.");
			return null;
		}
		StringBuilder text = new StringBuilder();
		for (var transcriptElement : transcriptsArray) {
			JsonObject transcriptObj = transcriptElement.getAsJsonObject();
			if (transcriptObj.has("text") && !transcriptObj.get("text").isJsonNull()) {
				text.append(transcriptObj.get("text").getAsString());
			}
		}
		return text.toString();
	}

	/**
	 * Writes the text, encoded as UTF-8, to {@code <baseName>.txt} in the given
	 * directory. When that file already exists a numeric suffix ({@code -1},
	 * {@code -2}, ...) is appended, because the timestamp base name only has
	 * one-second resolution and several results can be saved within one second.
	 * @param directory target directory.
	 * @param baseName file name without extension.
	 * @param text content to write.
	 * @return the file that was written.
	 * @throws IOException if the file cannot be written.
	 */
	private static File writeToNewFile(String directory, String baseName, String text) throws IOException {
		File target = new File(directory + File.separator + baseName + ".txt");
		for (int suffix = 1; target.exists(); suffix++) {
			target = new File(directory + File.separator + baseName + "-" + suffix + ".txt");
		}
		try (FileOutputStream fileOutputStream = new FileOutputStream(target)) {
			fileOutputStream.write(text.getBytes(StandardCharsets.UTF_8));
		}
		return target;
	}
}
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,6 @@ spring:
application:
name: tongyi-example

cloud:
ai:
tongyi:
api-key: sk-0e6c387446ff45d0924111475a82462e
# Please set the api-key; supplying it through an environment variable is recommended.
# Note: the api-key shown here is invalid; please apply for a new one.
# export SPRING_CLOUD_AI_TONGYI_API_KEY=sk-a3d73b1709bf4a178c28ed7c8b3b5a345

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,10 @@

import java.util.Objects;

import com.alibaba.cloud.ai.tongyi.audio.TongYiAudioSpeechClient;
import com.alibaba.cloud.ai.tongyi.audio.TongYiAudioSpeechProperties;
import com.alibaba.cloud.ai.tongyi.audio.speech.TongYiAudioSpeechClient;
import com.alibaba.cloud.ai.tongyi.audio.speech.TongYiAudioSpeechProperties;
import com.alibaba.cloud.ai.tongyi.audio.transcription.TongYiAudioTranscriptionClient;
import com.alibaba.cloud.ai.tongyi.audio.transcription.TongYiAudioTranscriptionProperties;
import com.alibaba.cloud.ai.tongyi.chat.TongYiChatClient;
import com.alibaba.cloud.ai.tongyi.chat.TongYiChatProperties;
import com.alibaba.cloud.ai.tongyi.constants.TongYiConstants;
Expand All @@ -30,6 +32,7 @@
import com.alibaba.cloud.ai.tongyi.image.TongYiImagesProperties;
import com.alibaba.dashscope.aigc.generation.Generation;
import com.alibaba.dashscope.aigc.imagesynthesis.ImageSynthesis;
import com.alibaba.dashscope.audio.asr.transcription.Transcription;
import com.alibaba.dashscope.audio.tts.SpeechSynthesizer;
import com.alibaba.dashscope.common.MessageManager;
import com.alibaba.dashscope.embeddings.TextEmbedding;
Expand Down Expand Up @@ -58,14 +61,16 @@
MessageManager.class,
TongYiChatClient.class,
TongYiImagesClient.class,
TongYiAudioSpeechClient.class
TongYiAudioSpeechClient.class,
TongYiAudioTranscriptionClient.class
})
@EnableConfigurationProperties({
TongYiChatProperties.class,
TongYiImagesProperties.class,
TongYiAudioSpeechProperties.class,
TongYiConnectionProperties.class,
TongYiTextEmbeddingProperties.class
TongYiTextEmbeddingProperties.class,
TongYiAudioTranscriptionProperties.class
})
public class TongYiAutoConfiguration {

Expand Down Expand Up @@ -101,6 +106,13 @@ public SpeechSynthesizer speechSynthesizer() {
return new SpeechSynthesizer();
}

/**
 * Registers a default {@link Transcription} instance; skipped when a bean of this
 * type is already defined.
 * @return a new {@link Transcription}.
 */
@Bean
@ConditionalOnMissingBean
public Transcription transcription() {

return new Transcription();
}

@Bean
@ConditionalOnMissingBean
public TextEmbedding textEmbedding() {
Expand Down Expand Up @@ -173,19 +185,37 @@ public TongYiAudioSpeechClient tongYiAudioSpeechClient(

@Bean
@ConditionalOnProperty(
prefix = TongYiAudioSpeechProperties.CONFIG_PREFIX,
prefix = TongYiAudioTranscriptionProperties.CONFIG_PREFIX,
name = "enabled",
havingValue = "true",
matchIfMissing = true
)
public TongYiAudioTranscriptionClient tongYiAudioTranscriptionClient(
Transcription transcription,
TongYiAudioTranscriptionProperties transcriptionProperties,
TongYiConnectionProperties connectionProperties) {

// NOTE(review): settingApiKey is defined outside this view; presumably it applies
// the API key from the connection properties before the client is built — confirm.
settingApiKey(connectionProperties);

// The client is built from the configured transcription options and the injected
// Transcription bean.
return new TongYiAudioTranscriptionClient(
transcriptionProperties.getOptions(),
transcription
);
}

/**
 * Creates the text embedding client. The bean is registered unless the
 * {@code enabled} property under the text embedding prefix is set to a value other
 * than {@code true}; a missing property counts as enabled.
 * @param textEmbedding embedding instance passed to the client.
 * @param textEmbeddingProperties injected but not read by this method.
 * @param connectionProperties connection settings handed to {@code settingApiKey}.
 * @return a new {@link TongYiTextEmbeddingClient}.
 */
@Bean
@ConditionalOnProperty(
prefix = TongYiTextEmbeddingProperties.CONFIG_PREFIX,
name = "enabled",
havingValue = "true",
matchIfMissing = true
)
public TongYiTextEmbeddingClient tongYiTextEmbeddingClient(
TextEmbedding textEmbedding,
TongYiTextEmbeddingProperties textEmbeddingProperties,
TongYiConnectionProperties connectionProperties
) {

// NOTE(review): settingApiKey is defined outside this view; presumably it applies
// the API key from the connection properties — confirm.
settingApiKey(connectionProperties);

return new TongYiTextEmbeddingClient(textEmbedding);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@
* @since 2023.0.0.0-RC1
*/

public final class AudioModels {
public final class AudioSpeechModels {

private AudioModels() {
private AudioSpeechModels() {
}

/**
Expand All @@ -37,8 +37,4 @@ private AudioModels() {
*/
public static final String SAMBERT_ZHICHU_V1 = "sambert-zhichu-v1";



}


Loading

0 comments on commit c52d1cf

Please sign in to comment.