// Copyright (C) 2025 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

#include "openvino/genai/perf_metrics.hpp"
#include "openvino/genai/visibility.hpp"


namespace ov::genai {

/**
 * @brief Performance metrics for each generate call of Speculative Decoding Pipeline.
 * It is an extended version of PerfMetrics.
 *
 * Cached mean and standard deviations.
 * @param ttst Mean and standard deviation of Time to the Second Token (TTST) in milliseconds.
 * The second token is presented separately as for some plugins this can be expected to take longer than next tokens.
 * In case of GPU plugin: Async compilation of some opt kernels can be completed after second token.
 *                        Also, additional memory manipulation can happen at second token time.
 * @param avg_latency Mean and standard deviation of the latency from the third token in milliseconds per inference, which also includes pre- and post-processing.
 * The latency of the first and second tokens can be obtained with the methods get_ttft() and get_ttst().
 * The first and second tokens take longer than the following tokens: the first token includes prompt analysis,
 * and both can include additional memory and compilation manipulations,
 * so they are excluded from the overall statistics.
 * 
 * Additional points:
 * TPOT is calculated starting from the third token; see the description of avg_latency for the reasons.
 * To get the number of iterations, use the size of the raw performance metrics: raw_metrics.m_durations.size().
 */
struct OPENVINO_GENAI_EXPORTS SDPerfMetrics : public ov::genai::ExtendedPerfMetrics {
    // Cached statistics, exposed as plain public data members.
    // Per the documentation of evaluate_statistics() below, mean/std values are derived from raw_metrics.
    ov::genai::MeanStdPair ttst;  // Mean/std of the time taken to generate the second token (in ms).
    ov::genai::MeanStdPair avg_latency;  // Mean/std of the latency counted from the third token (in ms); first and second token times are reported separately as ttft and ttst.

    /**
    * @brief Returns the mean and standard deviation of TTST (Time To the Second Token).
    *
    * @return TTST statistics, returned by value.
    * @note The method is declared non-const.
    *       NOTE(review): presumably it refreshes the cached statistics before returning;
    *       confirm against the definition, which is not part of this header.
    */
    ov::genai::MeanStdPair get_ttst();

    /**
    * @brief Returns the mean and standard deviation of the latency (avg_latency).
    *
    * @return Latency statistics, returned by value.
    * @note The method is declared non-const; see the definition (not part of this header) for details.
    */
    ov::genai::MeanStdPair get_latency();

    /**
    * @brief Calculates mean/std values from raw_metrics.
    *
    * Overrides the virtual method inherited from the base class.
    *
    * @param start_time Optional start time, for the case where the duration needs to be updated.
    *                   Defaults to std::nullopt.
    */
    void evaluate_statistics(std::optional<TimePoint> start_time = std::nullopt) override;
};

/**
 * @brief Performance metrics for the main and draft models of the Speculative Decoding Pipeline.
 *
 * SDPerModelsPerfMetrics includes metrics for main model and draft model.
 * @param main_model_metrics A structure of SDPerfMetrics type that holds performance metrics for main model.
 * @param draft_model_metrics A structure of SDPerfMetrics type that holds performance metrics for draft model.
 * @param num_accepted_tokens the number of tokens generated by the draft model and accepted by the main model.
 *
 */
struct OPENVINO_GENAI_EXPORTS SDPerModelsPerfMetrics : public ov::genai::SDPerfMetrics {
    ov::genai::SDPerfMetrics main_model_metrics; // perf metrics of the main model
    ov::genai::SDPerfMetrics draft_model_metrics; // perf metrics of the draft model
    // Number of tokens generated by the draft model and accepted by the main model.
    // Zero-initialized in-class so the counter never holds an indeterminate value,
    // regardless of what the out-of-line constructor initializes.
    size_t num_accepted_tokens = 0;

    /**
    * @brief Default constructor; defined out of line.
    */
    SDPerModelsPerfMetrics();

    /**
    * @brief Returns the number of tokens generated by the draft model and accepted by the main model.
    *
    * This value is not equal to the number of generated tokens, since the main model generates one token of its own on each inference.
    * Thus, number of generated tokens = SUM(number of accepted tokens (generated by draft model and accepted by main model) + 1 token (generated by main model)).
    *
    * @return The number of accepted tokens.
    * @note The method is declared non-const.
    */
    size_t get_num_accepted_tokens();

    /**
    * @brief Calculates mean/std values from raw_metrics.
    *
    * Overrides the virtual method inherited from the base class.
    *
    * @param start_time Optional start time, for the case where the duration needs to be updated.
    *                   Defaults to std::nullopt.
    */
    void evaluate_statistics(std::optional<TimePoint> start_time = std::nullopt) override;
};

}