主动介入

package nexus.io.voice.agent.callback;

import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;

/**
 * Holder for the scheduler that the realtime-bridge callbacks share.
 */
public class CallbackExecutorService {

  /**
   * Scheduled pool with a core size of 1. Its worker is a daemon thread named
   * {@code ws-realtime-bridge-callback-scheduler}.
   */
  public static final ScheduledExecutorService SHARED_SCHEDULER =
      Executors.newScheduledThreadPool(1, CallbackExecutorService::newSchedulerThread);

  /**
   * Thread factory for {@link #SHARED_SCHEDULER}.
   *
   * @param task the runnable the pool wants executed
   * @return a named daemon thread wrapping {@code task}
   */
  private static Thread newSchedulerThread(Runnable task) {
    Thread worker = new Thread(task, "ws-realtime-bridge-callback-scheduler");
    worker.setDaemon(true);
    return worker;
  }
}

CallbackPromptUtils

package nexus.io.voice.agent.callback;

/**
 * Builds the prompt text used for proactive intervention.
 */
public class CallbackPromptUtils {

  /**
   * Assembles the instruction sent to the model when the user has been silent after an assistant turn.
   *
   * @param lastAssistantText most recent assistant text; null or blank is rendered as the placeholder "无"
   * @param lastUserText      most recent user text; null or blank is rendered as the placeholder "无"
   * @param idleMs            silence duration in milliseconds; reported in whole seconds, never less than 1
   * @return the complete prompt, lines separated by {@code \n}, without a trailing newline
   */
  public static String buildProactiveInterventionPrompt(String lastAssistantText, String lastUserText, long idleMs) {
    // Whole seconds, clamped so the prompt never reports zero or negative silence.
    long silentSeconds = idleMs / 1000L;
    if (silentSeconds < 1L) {
      silentSeconds = 1L;
    }

    String assistantPart = emptyToDefault(lastAssistantText, "无");
    String userPart = emptyToDefault(lastUserText, "无");

    StringBuilder prompt = new StringBuilder(512);
    prompt.append("系统提示：当前是实时语音场景。\n");
    prompt.append("模型刚刚已经完成了一轮提问或回应，直到现在用户已经沉默了 ")
        .append(silentSeconds)
        .append(" 秒，仍未开始正式回答。\n");
    prompt.append("请你根据当前上下文主动介入，但要自然、简洁、像真人模型，不要机械重复。\n");
    prompt.append("你的目标是推动对话继续进行。\n");
    prompt.append("你可以视上下文选择：\n");
    prompt.append("1. 温和提醒用户继续回答；\n");
    prompt.append("2. 如果用户可能卡住了，给一个轻微引导；\n");
    prompt.append("3. 如果用户已回答过部分内容，可基于他的内容继续追问；\n");
    prompt.append("4. 如果问题较难，也可以建议先给简短结论再展开。\n");
    prompt.append("请直接输出你要对用户说的话，不要解释策略。\n");
    prompt.append("最近一轮模型内容：").append(assistantPart).append("\n");
    prompt.append("最近用户内容：").append(userPart);
    return prompt.toString();
  }

  /**
   * Returns the trimmed {@code value}, or {@code dft} when {@code value} is null or blank after trimming.
   */
  private static String emptyToDefault(String value, String dft) {
    if (value == null) {
      return dft;
    }
    String trimmed = value.trim();
    if (trimmed.isEmpty()) {
      return dft;
    }
    return trimmed;
  }

}

WsRealtimeBridgeCallback

package nexus.io.voice.agent.callback;

import java.nio.file.Path;
import java.util.concurrent.ScheduledFuture;
import java.util.concurrent.TimeUnit;
import java.util.concurrent.atomic.AtomicBoolean;
import java.util.function.Consumer;

import nexus.io.media.NativeMedia;
import nexus.io.tio.core.ChannelContext;
import nexus.io.tio.core.Tio;
import nexus.io.tio.utils.json.JsonUtils;
import nexus.io.tio.websocket.common.WebSocketResponse;
import nexus.io.voice.agent.audio.AudioFinishCallback;
import nexus.io.voice.agent.audio.SessionAudioRecorder;
import nexus.io.voice.agent.bridge.RealtimeBridgeCallback;
import nexus.io.voice.agent.bridge.RealtimeSetup;
import nexus.io.voice.agent.consts.VoiceAgentConst;
import nexus.io.voice.agent.model.WsVoiceAgentResponseMessage;
import nexus.io.voice.agent.utils.ChannelContextUtils;

import lombok.extern.slf4j.Slf4j;

/**
 * {@link RealtimeBridgeCallback} for a single websocket session.
 *
 * <p>It forwards text and binary output to the client channel through {@code Tio.send}, appends model
 * audio to {@code SessionAudioRecorder}, and derives conversation state from the events it relays.
 * Once {@link #start(RealtimeSetup)} has been called, a task on
 * {@link CallbackExecutorService#SHARED_SCHEDULER} checks once per second whether the user has stayed
 * silent after an assistant turn; when proactive intervention is enabled and the thresholds are met,
 * it hands a prompt to the sender bound through {@link #bindModelTextSender(Consumer)}.
 *
 * <p>Mutable state is held in {@code volatile} fields: it is written by the callback methods and read
 * by the scheduler task.
 */
@Slf4j
public class WsRealtimeBridgeCallback implements RealtimeBridgeCallback {

  /**
   * Handle of the periodic proactive-intervention check; {@code null} until the task is scheduled.
   */
  private volatile ScheduledFuture<?> proactiveFuture;

  private final ChannelContext channelContext;
  private final String sessionId;

  /**
   * Whether proactive intervention is enabled.
   */
  private volatile boolean proactiveInterventionEnabled = false;

  /**
   * How long the user must stay silent after an assistant turn before the first intervention.
   */
  private volatile long proactiveInterventionTimeoutMs = 8_000L;

  /**
   * Minimum interval between two proactive interventions.
   */
  private volatile long proactiveInterventionRepeatMs = 8_000L;

  /**
   * Whether the assistant has finished speaking and the session is waiting for the user's answer.
   */
  private volatile boolean waitingForUserAnswer = false;

  /**
   * Time (epoch millis) at which the assistant last completed a turn; 0 until that first happens.
   */
  private volatile long lastAssistantTurnCompleteAt = 0L;

  /**
   * Time of the last real user speech / text input. Written here; not read anywhere in this class.
   */
  private volatile long lastRealUserSpeechAt = 0L;

  /**
   * Time of the last assistant activity. Written here; not read anywhere in this class.
   */
  private volatile long lastAssistantActivityAt = 0L;

  /**
   * Time of the last activity of any kind. Written here; not read anywhere in this class.
   */
  private volatile long lastActivityAt = System.currentTimeMillis();

  /**
   * Time of the last proactive intervention; 0 until one has been triggered.
   */
  private volatile long lastProactiveInterventionAt = 0L;

  /**
   * Most recent assistant text.
   */
  private volatile String lastAssistantText = "";

  /**
   * Most recent user text.
   */
  private volatile String lastUserText = "";

  /**
   * Whether this callback has been closed.
   */
  private volatile boolean closed = false;

  /**
   * Injected by the handler; actually delivers a text prompt to the model.
   */
  private volatile Consumer<String> modelTextSender;

  /**
   * Ensures the periodic check is scheduled at most once.
   */
  private final AtomicBoolean proactiveTaskStarted = new AtomicBoolean(false);

  /**
   * Ensures the teardown in {@link #close(String)} runs at most once.
   */
  private final AtomicBoolean closeStarted = new AtomicBoolean(false);

  /**
   * @param channelContext the client channel this callback writes to; its key becomes the session id
   */
  public WsRealtimeBridgeCallback(ChannelContext channelContext) {
    this.channelContext = channelContext;
    this.sessionId = ChannelContextUtils.key(channelContext);
  }

  /**
   * Binds the function used to push a proactive-intervention prompt to the model.
   *
   * @param modelTextSender receiver of the prompt text; while it is {@code null} no intervention is sent
   */
  public void bindModelTextSender(Consumer<String> modelTextSender) {
    this.modelTextSender = modelTextSender;
  }

  /**
   * Configures proactive intervention.
   *
   * @param enabled   turns the feature on or off
   * @param timeoutMs silence threshold in milliseconds; values {@code <= 0} keep the current setting
   * @param repeatMs  minimum gap between interventions in milliseconds; values {@code <= 0} keep the
   *                  current setting
   */
  public void configureProactiveIntervention(boolean enabled, long timeoutMs, long repeatMs) {
    this.proactiveInterventionEnabled = enabled;
    if (timeoutMs > 0) {
      this.proactiveInterventionTimeoutMs = timeoutMs;
    }
    if (repeatMs > 0) {
      this.proactiveInterventionRepeatMs = repeatMs;
    }
  }

  /**
   * Records that audio is being uploaded. This does not mean the user actually spoke, so
   * {@code waitingForUserAnswer} is left untouched and the call does not end the "silence" period.
   */
  public void onUserAudioActivity() {
    this.lastActivityAt = System.currentTimeMillis();
  }

  /**
   * Records an explicit text input from the user, which counts as a real answer.
   *
   * @param text the user's text; {@code null} is stored as an empty string
   */
  public void onUserTextActivity(String text) {
    this.lastUserText = safeText(text);
    markRealUserSpeechActivity("user_text_input");
  }

  /**
   * Inspects the outgoing JSON event to update conversation state, then sends it to the client.
   */
  @Override
  public void sendText(String json) {
    inspectServerEvent(json);

    WebSocketResponse wsResp = WebSocketResponse.fromText(json, VoiceAgentConst.CHARSET);
    Tio.send(channelContext, wsResp);
  }

  /**
   * Records the model audio (best effort), marks assistant activity and sends the bytes to the client.
   */
  @Override
  public void sendBinary(byte[] bytes) {
    try {
      SessionAudioRecorder.appendModelPcm(sessionId, bytes);
    } catch (Exception ex) {
      // Recording is best effort; a recorder failure must not stop audio delivery.
      log.warn("record model pcm failed: {}", ex.getMessage());
    }

    markAssistantActivity();

    WebSocketResponse wsResp = WebSocketResponse.fromBytes(bytes);
    Tio.send(channelContext, wsResp);
  }

  /**
   * Tears the session down: cancels the periodic check, stops the recorder (converting the finished
   * recording through {@code NativeMedia.toMp3}) and removes the client channel.
   *
   * <p>Only the first call has any effect. {@code close} is invoked by
   * {@code VoiceSocketHandler#cleanupSession}; NOTE(review): presumably the bridge can invoke it as
   * well, which is why repeated calls are ignored instead of stopping the recorder and removing the
   * channel a second time.
   *
   * @param reason passed to {@code Tio.remove}
   */
  @Override
  public void close(String reason) {
    if (!closeStarted.compareAndSet(false, true)) {
      return;
    }
    closed = true;

    cancelProactiveTask();

    AudioFinishCallback audioFinishCallback = new AudioFinishCallback() {
      @Override
      public void done(Path audioFile) {
        if (audioFile == null) {
          log.warn("audio finish callback received no file, sessionId:{}", sessionId);
          return;
        }
        String wavFilePath = audioFile.toString();
        NativeMedia.toMp3(wavFilePath);
      }
    };

    SessionAudioRecorder.stop(sessionId, audioFinishCallback);
    Tio.remove(channelContext, reason);
  }

  /**
   * No-op: the session id reported by the bridge is not used by this callback.
   */
  @Override
  public void session(String sessionId) {
  }

  /**
   * Explicit turn-complete notification from the bridge.
   *
   * <p>An {@code assistant}/{@code model} turn stores the text and starts waiting for the user; a
   * {@code user} turn stores the text and counts as real user speech. Other roles are ignored, as is
   * any call made after {@link #close(String)}.
   */
  @Override
  public void turnComplete(String role, String text) {
    if (closed) {
      return;
    }

    if ("assistant".equalsIgnoreCase(role) || "model".equalsIgnoreCase(role)) {
      this.lastAssistantText = safeText(text);
      enterWaitingForUserAnswer("turnComplete(role=assistant)");
    } else if ("user".equalsIgnoreCase(role)) {
      this.lastUserText = safeText(text);
      markRealUserSpeechActivity("turnComplete(role=user)");
    }
  }

  /**
   * Schedules the periodic proactive-intervention check (once per callback).
   */
  @Override
  public void start(RealtimeSetup setup) {
    startProactiveTaskIfNeeded();
  }

  /**
   * Schedules {@link #checkAndTriggerProactiveIntervention()} to run every second, unless the callback
   * is already closed or the task was scheduled before.
   */
  private void startProactiveTaskIfNeeded() {
    if (closed) {
      return;
    }
    if (!proactiveTaskStarted.compareAndSet(false, true)) {
      return;
    }

    proactiveFuture = CallbackExecutorService.SHARED_SCHEDULER.scheduleAtFixedRate(() -> {
      try {
        checkAndTriggerProactiveIntervention();
      } catch (Throwable e) {
        // An exception escaping the task would suppress all its later runs, so it is contained here.
        log.warn("checkAndTriggerProactiveIntervention error, sessionId:{}", sessionId, e);
      }
    }, 1, 1, TimeUnit.SECONDS);

    // close() may have run while the task was being scheduled and found no future to cancel.
    if (closed) {
      cancelProactiveTask();
    }
  }

  /**
   * Cancels the periodic check if it has been scheduled; does nothing otherwise.
   */
  private void cancelProactiveTask() {
    ScheduledFuture<?> future = this.proactiveFuture;
    if (future == null) {
      return;
    }
    try {
      future.cancel(true);
    } catch (Exception e) {
      log.warn("shutdown scheduler failed: {}", e.getMessage());
    }
  }

  /**
   * One run of the periodic check. Sends an intervention prompt only when all of these hold: not
   * closed, feature enabled, waiting for the user, a sender is bound, an assistant turn has completed,
   * the silence has lasted at least the timeout, and the previous intervention (if any) is at least
   * the repeat interval old.
   */
  private void checkAndTriggerProactiveIntervention() {
    if (closed || !proactiveInterventionEnabled || !waitingForUserAnswer) {
      return;
    }

    Consumer<String> sender = this.modelTextSender;
    if (sender == null) {
      return;
    }

    // Read each volatile timestamp once so the decision and the log line use the same values.
    final long turnCompleteAt = lastAssistantTurnCompleteAt;
    if (turnCompleteAt <= 0L) {
      return;
    }

    final long now = System.currentTimeMillis();
    final long idleMs = now - turnCompleteAt;
    if (idleMs < proactiveInterventionTimeoutMs) {
      return;
    }

    final long previousInterventionAt = lastProactiveInterventionAt;
    if (previousInterventionAt > 0L && now - previousInterventionAt < proactiveInterventionRepeatMs) {
      return;
    }

    String interventionPrompt = CallbackPromptUtils.buildProactiveInterventionPrompt(lastAssistantText, lastUserText,
        idleMs);

    log.info(
        "trigger proactive intervention, sessionId:{}, idleMs:{}, waitingForUserAnswer:{}, lastAssistantTurnCompleteAt:{}",
        sessionId, idleMs, waitingForUserAnswer, turnCompleteAt);

    // Stamped before sending so a failing sender is still throttled by the repeat interval.
    lastProactiveInterventionAt = now;

    try {
      sender.accept(interventionPrompt);
      markAssistantActivity();
    } catch (Exception e) {
      log.warn("modelTextSender.accept failed, sessionId:{}, prompt:{}", sessionId, interventionPrompt, e);
    }
  }

  /**
   * Parses an outgoing server event and updates conversation state according to its {@code type}.
   * Empty input, unparseable JSON and messages without a type are ignored.
   */
  private void inspectServerEvent(String json) {
    if (json == null || json.isEmpty()) {
      return;
    }

    try {
      WsVoiceAgentResponseMessage msg = JsonUtils.parse(json, WsVoiceAgentResponseMessage.class);
      if (msg == null || msg.getType() == null) {
        return;
      }

      String type = msg.getType();

      if ("transcript_in".equalsIgnoreCase(type)) {
        this.lastUserText = safeText(msg.getText());
        markRealUserSpeechActivity("transcript_in");
        return;
      }

      if ("speech_started".equalsIgnoreCase(type)) {
        markRealUserSpeechActivity("speech_started");
        return;
      }

      if ("transcript_out".equalsIgnoreCase(type) || "text".equalsIgnoreCase(type)) {
        this.lastAssistantText = safeText(msg.getText());
        markAssistantActivity();
        return;
      }

      if ("assistant_turn_start".equalsIgnoreCase(type)) {
        markAssistantActivity();
        return;
      }

      if ("assistant_turn_complete".equalsIgnoreCase(type) || "turn_complete".equalsIgnoreCase(type)) {
        enterWaitingForUserAnswer(type);
        return;
      }

      if ("assistant_turn_interrupt".equalsIgnoreCase(type) || "interrupted".equalsIgnoreCase(type)) {
        // An interruption is treated as the user having spoken.
        markRealUserSpeechActivity(type);
        return;
      }

      if ("error".equalsIgnoreCase(type) || "go_away".equalsIgnoreCase(type)) {
        markAssistantActivity();
      }
    } catch (Exception e) {
      // State tracking is best effort; the event is still forwarded to the client by sendText.
      log.debug("inspectServerEvent parse failed, sessionId:{}, json:{}", sessionId, json, e);
    }
  }

  /**
   * Marks the assistant turn as complete and starts waiting for the user's answer.
   *
   * @param reason what triggered the transition; used for logging only
   */
  private void enterWaitingForUserAnswer(String reason) {
    long now = System.currentTimeMillis();
    this.waitingForUserAnswer = true;
    this.lastAssistantTurnCompleteAt = now;
    this.lastActivityAt = now;

    log.info("enter waitingForUserAnswer, sessionId:{}, reason:{}, proactiveEnabled:{}, lastAssistantText:{}",
        sessionId, reason, proactiveInterventionEnabled, shortText(lastAssistantText));
  }

  /**
   * Records real user speech / input and stops waiting for the user's answer.
   *
   * @param reason what triggered the transition; used for logging only
   */
  private void markRealUserSpeechActivity(String reason) {
    long now = System.currentTimeMillis();
    this.lastRealUserSpeechAt = now;
    this.lastActivityAt = now;
    this.waitingForUserAnswer = false;

    log.info("mark real user speech activity, sessionId:{}, reason:{}, lastUserText:{}", sessionId, reason,
        shortText(lastUserText));
  }

  /**
   * Stamps the assistant-activity and general-activity timestamps with the current time.
   */
  private void markAssistantActivity() {
    long now = System.currentTimeMillis();
    this.lastAssistantActivityAt = now;
    this.lastActivityAt = now;
  }

  /**
   * Returns the trimmed text, or an empty string for {@code null}.
   */
  private String safeText(String text) {
    return text == null ? "" : text.trim();
  }

  /**
   * Returns the trimmed text shortened to at most 120 characters (plus {@code "..."} when cut) for
   * logging; {@code null} becomes an empty string.
   */
  private String shortText(String text) {
    if (text == null) {
      return "";
    }
    String s = text.trim();
    return s.length() <= 120 ? s : s.substring(0, 120) + "...";
  }
}

VoiceSocketHandler

package nexus.io.voice.agent.handler;

import java.util.Locale;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

import nexus.io.tio.consts.TioConst;
import nexus.io.tio.core.ChannelContext;
import nexus.io.tio.core.Tio;
import nexus.io.tio.http.common.HttpRequest;
import nexus.io.tio.http.common.HttpResponse;
import nexus.io.tio.utils.json.JsonUtils;
import nexus.io.tio.websocket.common.WebSocketRequest;
import nexus.io.tio.websocket.common.WebSocketResponse;
import nexus.io.tio.websocket.common.WebSocketSessionContext;
import nexus.io.tio.websocket.server.handler.IWebSocketHandler;
import nexus.io.voice.agent.audio.SessionAudioRecorder;
import nexus.io.voice.agent.bridge.RealtimeModelBridge;
import nexus.io.voice.agent.bridge.RealtimeModelBridgeFactory;
import nexus.io.voice.agent.bridge.RealtimeSetup;
import nexus.io.voice.agent.callback.WsRealtimeBridgeCallback;
import nexus.io.voice.agent.model.WsVoiceAgentRequestMessage;
import nexus.io.voice.agent.model.WsVoiceAgentResponseMessage;
import nexus.io.voice.agent.model.WsVoiceAgentType;
import nexus.io.voice.agent.utils.ChannelContextUtils;

import lombok.extern.slf4j.Slf4j;

/**
 * Websocket handler that connects each client connection to a realtime model bridge.
 *
 * <p>The first {@code SETUP} text message creates the bridge and its {@link WsRealtimeBridgeCallback};
 * later binary frames are forwarded to the bridge as PCM audio, and later text messages are dispatched
 * by their {@code type}. Bridges and callbacks are kept in static maps keyed by
 * {@code ChannelContextUtils.key(channelContext)}.
 */
@Slf4j
public class VoiceSocketHandler implements IWebSocketHandler {

  /**
   * One bridge per client connection.
   */
  private static final Map<String, RealtimeModelBridge> BRIDGES = new ConcurrentHashMap<>();

  /**
   * One callback per client connection.
   */
  private static final Map<String, WsRealtimeBridgeCallback> CALLBACKS = new ConcurrentHashMap<>();

  /**
   * Master switch for proactive intervention.
   */
  private static final boolean ENABLE_PROACTIVE_INTERVENTION = true;

  /**
   * How long the user must stay silent after an assistant turn before the first intervention.
   */
  private static final long PROACTIVE_INTERVENTION_TIMEOUT_MS = 8_000L;

  /**
   * Minimum interval between two proactive interventions.
   */
  private static final long PROACTIVE_INTERVENTION_REPEAT_MS = 8_000L;

  /**
   * Logs the handshake request and accepts it by returning the response unchanged.
   */
  @Override
  public HttpResponse handshake(HttpRequest httpRequest, HttpResponse response, ChannelContext channelContext)
      throws Exception {
    log.info("请求信息: {}", httpRequest);
    return response;
  }

  /**
   * Logs that the handshake completed; the bridge is only created later, on the SETUP message.
   */
  @Override
  public void onAfterHandshaked(HttpRequest httpRequest, HttpResponse httpResponse, ChannelContext channelContext)
      throws Exception {
    log.info("握手完成: {}", httpRequest);
  }

  /**
   * Releases the session's bridge and callback when the client closes the connection.
   */
  @Override
  public Object onClose(WebSocketRequest wsRequest, byte[] bytes, ChannelContext channelContext) throws Exception {
    String sessionKey = ChannelContextUtils.key(channelContext);
    cleanupSession(channelContext, sessionKey, "客户端主动关闭连接");
    return null;
  }

  /**
   * Handles a binary frame: records it as user PCM (best effort) and forwards it to the bridge.
   *
   * @return always {@code null}
   */
  @Override
  public Object onBytes(WebSocketRequest wsRequest, byte[] bytes, ChannelContext channelContext) throws Exception {
    String sessionKey = ChannelContextUtils.key(channelContext);

    // Incoming bytes only mean the microphone stream has data, not that the user actually spoke,
    // so the callback is only lightly notified.
    WsRealtimeBridgeCallback callback = CALLBACKS.get(sessionKey);
    if (callback != null) {
      callback.onUserAudioActivity();
    }

    try {
      SessionAudioRecorder.appendUserPcm(sessionKey, bytes);
    } catch (Exception ex) {
      log.warn("appendUserPcm failed: {}", ex.getMessage());
    }

    RealtimeModelBridge bridge = BRIDGES.get(sessionKey);
    if (bridge != null) {
      try {
        bridge.sendPcm16k(bytes);
      } catch (Exception e) {
        log.error("bridge.sendPcm16k error, sessionKey:{}", sessionKey, e);
      }
    } else {
      log.warn("bridge not found when onBytes, sessionKey:{}", sessionKey);
    }

    return null;
  }

  /**
   * Handles a text frame.
   *
   * <p>Text that cannot be parsed is dropped. While no bridge exists, a typed message is handled as a
   * potential SETUP and anything else is answered with an {@code ERROR} ("no bridge"). Once a bridge
   * exists, typed messages are dispatched by type; failures there are logged, not propagated.
   *
   * @return always {@code null}
   */
  @Override
  public Object onText(WebSocketRequest wsRequest, String text, ChannelContext channelContext) throws Exception {
    WebSocketSessionContext wsSessionContext = (WebSocketSessionContext) channelContext.get();
    String path = wsSessionContext.getHandshakeRequest().getRequestLine().path;
    log.info("路径：{}，收到消息：{}", path, text);

    String rawText = text == null ? "" : text.trim();

    WsVoiceAgentRequestMessage msg = null;
    try {
      msg = JsonUtils.parse(rawText, WsVoiceAgentRequestMessage.class);
    } catch (Exception je) {
      log.debug("收到非 JSON 文本或无法解析为 WsMessage: {}", je.getMessage());
      return null;
    } catch (Throwable e) {
      log.error("解析收到的消息异常", e);
      return null;
    }

    String sessionKey = ChannelContextUtils.key(channelContext);
    RealtimeModelBridge bridge = BRIDGES.get(sessionKey);
    boolean typed = msg != null && msg.getType() != null;

    if (bridge == null) {
      if (typed) {
        handleMessageBeforeBridge(channelContext, sessionKey, msg);
      } else {
        sendJson(channelContext, new WsVoiceAgentResponseMessage(WsVoiceAgentType.ERROR.name(), "no bridge"));
      }
      return null;
    }

    if (typed) {
      try {
        dispatchToBridge(channelContext, sessionKey, bridge, msg, rawText);
      } catch (Exception e) {
        log.error("onText handle error, sessionKey:{}", sessionKey, e);
      }
    }

    return null;
  }

  /**
   * Handles a typed message that arrives before a bridge exists. Only SETUP is accepted: it connects
   * the bridge and acknowledges with {@code SETUP_RECEIVED}, or reports an {@code ERROR} when the
   * bridge could not be set up. Any other type is logged and ignored.
   */
  private void handleMessageBeforeBridge(ChannelContext channelContext, String sessionKey,
      WsVoiceAgentRequestMessage msg) {
    if (parseType(msg.getType()) != WsVoiceAgentType.SETUP) {
      log.warn("bridge not ready and first message is not SETUP, sessionKey:{}, type:{}", sessionKey, msg.getType());
      return;
    }

    RealtimeSetup realtimeSetup = new RealtimeSetup(msg.getSystem_prompt(), msg.getUser_prompt(),
        msg.getJob_description(), msg.getResume(), msg.getQuestions(), msg.getGreeting());

    if (!connectLLM(channelContext, msg.getPlatform(), realtimeSetup)) {
      // Do not acknowledge a setup whose session has already been torn down again.
      sendJson(channelContext, new WsVoiceAgentResponseMessage(WsVoiceAgentType.ERROR.name(), "bridge setup failed"));
      return;
    }

    WsVoiceAgentResponseMessage resp = new WsVoiceAgentResponseMessage(WsVoiceAgentType.SETUP_RECEIVED.name());
    resp.setSessionId(sessionKey);
    sendJson(channelContext, resp);
  }

  /**
   * Dispatches a typed message to an existing bridge. Unknown type strings are logged at debug level;
   * known types without a dedicated case are answered with {@code IGNORED} carrying the raw text.
   */
  private void dispatchToBridge(ChannelContext channelContext, String sessionKey, RealtimeModelBridge bridge,
      WsVoiceAgentRequestMessage msg, String rawText) throws Exception {
    WsVoiceAgentType typeEnum = parseType(msg.getType());
    if (typeEnum == null) {
      log.debug("未知的 type: {}", msg.getType());
      return;
    }

    switch (typeEnum) {
    case AUDIO_END: {
      bridge.endAudioInput();
      break;
    }

    case TEXT: {
      String userText = msg.getText() == null ? "" : msg.getText();

      // Tell the callback first so explicit text counts as a real answer.
      WsRealtimeBridgeCallback callback = CALLBACKS.get(sessionKey);
      if (callback != null) {
        callback.onUserTextActivity(userText);
      }

      bridge.sendText(userText);
      break;
    }

    case CLOSE: {
      cleanupSession(channelContext, sessionKey, "client requested close");
      break;
    }

    default: {
      sendJson(channelContext, new WsVoiceAgentResponseMessage(WsVoiceAgentType.IGNORED.name(), rawText));
      break;
    }
    }
  }

  /**
   * Creates, registers and connects the bridge and callback for a session.
   *
   * @return {@code true} when the bridge connected; {@code false} when the bridge could not be created
   *         or connected, in which case the session has already been cleaned up
   */
  private boolean connectLLM(ChannelContext channelContext, String platform, RealtimeSetup setup) {
    String sessionKey = ChannelContextUtils.key(channelContext);

    WsRealtimeBridgeCallback callback = new WsRealtimeBridgeCallback(channelContext);
    callback.configureProactiveIntervention(ENABLE_PROACTIVE_INTERVENTION, PROACTIVE_INTERVENTION_TIMEOUT_MS,
        PROACTIVE_INTERVENTION_REPEAT_MS);

    try {
      SessionAudioRecorder.start(sessionKey, 16000, 24000);
    } catch (Exception e) {
      log.warn("start recorder failed: {}", e.getMessage());
    }

    RealtimeModelBridge bridge;
    try {
      bridge = RealtimeModelBridgeFactory.createBridge(platform, callback);
    } catch (Exception e) {
      log.error("createBridge error, sessionKey:{}, platform:{}", sessionKey, platform, e);
      bridge = null;
    }
    if (bridge == null) {
      // ConcurrentHashMap rejects null values, so a missing bridge must not reach BRIDGES.put.
      // Nothing is registered yet; closing the callback stops the recorder and removes the channel.
      log.error("no bridge created, sessionKey:{}, platform:{}", sessionKey, platform);
      try {
        callback.close("bridge create failed");
      } catch (Exception e) {
        log.warn("callback.close error, sessionKey:{}", sessionKey, e);
      }
      return false;
    }

    // The sender looks the bridge up on every call, so it stops delivering once the session is cleaned up.
    callback.bindModelTextSender(prompt -> {
      try {
        RealtimeModelBridge b = BRIDGES.get(sessionKey);
        if (b != null) {
          b.sendText(prompt);
        } else {
          log.warn("bridge not found when proactive intervention, sessionKey:{}", sessionKey);
        }
      } catch (Exception e) {
        log.warn("bridge.sendText failed, sessionKey:{}, prompt:{}", sessionKey, prompt, e);
      }
    });

    callback.start(setup);

    CALLBACKS.put(sessionKey, callback);
    BRIDGES.put(sessionKey, bridge);

    try {
      bridge.connect(setup);
      return true;
    } catch (Exception e) {
      log.error("bridge.connect error, sessionKey:{}", sessionKey, e);
      cleanupSession(channelContext, sessionKey, "bridge connect failed");
      return false;
    }
  }

  /**
   * Unregisters the session and releases its resources.
   *
   * <p>Closing the bridge is tried first. NOTE(review): the bridge is presumably expected to close its
   * callback itself, which is why a successful {@code bridge.close()} ends the cleanup — confirm
   * against the bridge implementations. If there is no bridge, or closing it fails, the callback is
   * closed directly; if that is not possible either, the channel is removed as a last resort. Each
   * step falls through to the next on failure, because the callback has already been taken out of
   * {@code CALLBACKS} and would otherwise never be closed.
   */
  private void cleanupSession(ChannelContext channelContext, String sessionKey, String reason) {
    WsRealtimeBridgeCallback callback = CALLBACKS.remove(sessionKey);
    RealtimeModelBridge bridge = BRIDGES.remove(sessionKey);

    if (bridge != null) {
      try {
        bridge.close();
        return;
      } catch (Exception e) {
        log.warn("bridge.close error, sessionKey:{}", sessionKey, e);
      }
    }

    if (callback != null) {
      try {
        callback.close(reason);
        return;
      } catch (Exception e) {
        log.warn("callback.close error, sessionKey:{}", sessionKey, e);
      }
    }

    try {
      Tio.remove(channelContext, reason);
    } catch (Exception e) {
      log.warn("Tio.remove error, sessionKey:{}", sessionKey, e);
    }
  }

  /**
   * Maps a message type string to its enum constant, ignoring case and surrounding whitespace.
   *
   * @return the matching constant, or {@code null} when {@code type} is null or unknown
   */
  private WsVoiceAgentType parseType(String type) {
    if (type == null) {
      return null;
    }

    try {
      // Locale.ROOT keeps the mapping independent of the JVM default locale (e.g. Turkish dotted I).
      return WsVoiceAgentType.valueOf(type.trim().toUpperCase(Locale.ROOT));
    } catch (Exception e) {
      return null;
    }
  }

  /**
   * Serializes the response (skipping null fields) and sends it to the client as UTF-8 text.
   */
  private void sendJson(ChannelContext channelContext, WsVoiceAgentResponseMessage message) {
    Tio.send(channelContext, WebSocketResponse.fromText(toJson(message), TioConst.UTF_8));
  }

  /**
   * Serializes a response message to JSON, omitting null fields.
   */
  private String toJson(WsVoiceAgentResponseMessage wsVoiceAgentResponseMessage) {
    return JsonUtils.toSkipNullJson(wsVoiceAgentResponseMessage);
  }
}