-- Copyright (c) 2024 Huawei Technologies Co., Ltd.
-- openUBMC is licensed under Mulan PSL v2.
-- You can use this software according to the terms and conditions of the Mulan PSL v2.
-- You may obtain a copy of Mulan PSL v2 at:
--         http://license.coscl.org.cn/MulanPSL2
-- THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
-- EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
-- MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
-- See the Mulan PSL v2 for more details.

local class = require 'mc.class'
local log = require 'mc.logging'
local singleton = require 'mc.singleton'
local drive_collection = require 'drive.drive_collection'
local common_def = require 'common_def'
local sml = require 'sml'
local task = require "task_service"
local task_mgmt = require 'mc.mdb.task_mgmt'
local skynet = require 'skynet'
local custom_error = require 'messages.custom'
local mdb = require 'mc.mdb'

-- RPC service class for drive sub-health diagnose operations.
local rpc_service_subhealth = class()
-- Path handed to task.create for every collection task created in this file.
local TASK_ATTACH_PATH = '/bmc/kepler/Systems/1/Storage/Drives'

-- Keys read from drive.io_diagnose_info when the IO deterioration
-- (slow-disk) parameters of a drive are reported.
local io_key_names = {
    'drive_name', 'media_type', 'manufacturer', 'serial_number', 'protocol', 'ref_controller_id',
    'glist_cnt', 'plist_cnt', 'unc_static', 'media_error', 'other_error', 'cmd_timeout', 'unexpected_sense',
    'bit_error_rate_zone', 'flyheight_clearance_delta_outer', 'flyheight_clearance_delta_inner',
    'flyheight_clearance_delta_middle', 'cur_farm', 'factory_farm'
}

-- Keys read from drive.estimated_remaining_lifespan_info when the estimated
-- lifespan parameters of a drive are reported.
local estimated_lifespan_key_names = {
    'drive_name', 'media_type', 'manufacturer', 'serial_number', 'protocol', 'is_support_hw_defined',
    'ref_controller_id', 'slc_avg_ec', 'tlc_avg_ec', 'slc_poh', 'tlc_poh', 'slc_pe_cycle', 'tlc_pe_cycle',
    'remn_wearout', 'power_on_hours', 'slc_used_lifespan', 'tlc_used_lifespan'
}

-- Constructor: stores the bus handle and resets the task bookkeeping.
-- @param bus bus handle, later passed to task.create and mdb.get_object
function rpc_service_subhealth:ctor(bus)
    -- drive name -> id of the diagnose-data collection task created for that drive
    self.drives_collect_tasks = {}
    -- id of the IO deterioration collection task in flight, nil when there is none
    self.task_id = nil
    self.bus = bus
end

-- Query the IO deterioration diagnose data of one drive through
-- sml.pd_get_io_diagnose_info and publish the result on the drive's
-- on_update_io_diagnose_info signal.
-- @param drive drive object; properties that are missing fall back to the
--        common_def.INVALID_U8 / common_def.INVALID_U32 placeholders or 'N/A'
-- Returns nothing. When the sml call raises, the error is logged and the
-- signal is not emitted.
local function get_drive_io_deterioration_diagnose_info(drive)
    local invalid_u8 = common_def.INVALID_U8
    local invalid_u32 = common_def.INVALID_U32

    -- Drive snapshot passed to sml as the last argument of the query
    local pd_info = {
        log_source = 0,
        log_rotate_num = 7,
        pd_power_state = drive.PowerState or invalid_u8,
        pd_interface_type = drive.Protocol or invalid_u8,
        pd_media_type = drive.MediaType or invalid_u8,
        pd_rebuild_progress = drive.RebuildProgress or invalid_u8,
        pd_health = drive.Health or invalid_u8,

        pd_prefail_error_count = drive.PredictedFailCount or invalid_u32,
        pd_media_error_count = drive.MediaErrorCount or invalid_u32,
        pd_other_error_count = drive.OtherErrorCount or invalid_u32,

        pd_device_name = drive.Name or 'N/A',
        pd_interface_type_str = common_def.DRIVE_PROTOCOL_STR[drive.Protocol] or 'N/A',
        pd_manufacturer = drive.Manufacturer or 'N/A',
        pd_serial_number = drive.SerialNumber or 'N/A',
        pd_model = drive.Model or 'N/A'
    }

    local succeeded, diag_info = pcall(sml.pd_get_io_diagnose_info,
        drive.RefControllerId or invalid_u8,
        drive.Protocol or invalid_u8,
        drive.device_id or invalid_u8,
        pd_info)
    if succeeded then
        drive.on_update_io_diagnose_info:emit(diag_info)
    else
        -- on failure the second pcall result is the error value
        log:error('Failed to get io_diagnose info, and return: %s', diag_info)
    end
end

-- Run one IO deterioration diagnose collection and report its progress on
-- the task identified by task_id (Starting -> Running -> Completed, or
-- Starting -> Completed when there is no drive at all).
-- @param task_id id of the task to update
local function start_io_deterioration_diagnose_colllect(task_id)
    log:notice('Start io diagnose colllect')
    local states = task_mgmt.state
    local statuses = task_mgmt.status

    -- All progress reports share the same message id and argument list;
    -- only the task state differs.
    local function report(state)
        task.update(task_id, state, statuses.OK,
            "CollectIODeteriorationDiagInfoProcess", {string.format('%s', task_id)})
    end

    -- Mark the task as starting
    report(states.Starting)

    local all_drives = drive_collection.get_instance():get_all_drives()
    if next(all_drives) == nil then
        log:notice("[Storage] Can not find any drives, not Start io diagnose colllect")
        report(states.Completed)
        return
    end

    -- Mark the task as running
    report(states.Running)

    -- Collect the IO diagnose info of every present HDD
    -- (Presence ~= 0 and MediaType == 0)
    for _, drive in pairs(all_drives) do
        if drive.Presence ~= 0 and drive.MediaType == 0 then
            get_drive_io_deterioration_diagnose_info(drive)
        end
    end

    -- Mark the task as completed
    report(states.Completed)
    log:notice('end io diagnose colllect')
end

-- Start (or join) the IO deterioration diagnose collection.
-- When a collection task is already in flight its id is returned, otherwise
-- a new task is created and the collection runs in a forked coroutine.
-- @return id of the collection task
function rpc_service_subhealth:collect_io_deterioration_diagnose_info()
    -- A task already exists: hand back its id
    if self.task_id then
        log:notice('get exist task id = %s', self.task_id)
        return self.task_id
    end

    -- Create the collection task
    local task_id = task.create(self.bus, 'StartCollectIODiaginfo', TASK_ATTACH_PATH)
    self.task_id = task_id
    skynet.fork_once(function()
        -- Run the worker protected: if it raised past this point the id below
        -- would never be cleared and every later request would keep
        -- receiving the id of a dead task.
        local ok, err = pcall(start_io_deterioration_diagnose_colllect, task_id)
        self.task_id = nil
        if not ok then
            log:error('Collect io diagnose info failed, task id = %s, error: %s', task_id, err)
        end
    end)
    log:notice('create new task id = %s', task_id)
    return task_id
end

-- Read one entry of drive.io_diagnose_info as a string.
-- @param drive     drive object; must be non-nil and have a Name
-- @param prop_name key inside drive.io_diagnose_info
-- @return the value (strings unchanged, anything else through tostring), or
--         nil when the drive is unusable, the info table is absent, or the
--         entry is nil/false
local function get_io_deterioration_info_prop(drive, prop_name)
    if not (drive and drive.Name) then
        log:notice('cannot get drive name')
        return
    end

    local diag_info = drive.io_diagnose_info
    local raw = diag_info and diag_info[prop_name]
    if not raw then
        return nil
    end
    -- raw is truthy here, so the and/or form cannot fall through by accident
    return type(raw) == "string" and raw or tostring(raw)
end

-- Read one entry of drive.estimated_remaining_lifespan_info as a string.
-- @param drive     drive object; must be non-nil and have a Name
-- @param prop_name key inside drive.estimated_remaining_lifespan_info
-- @return the value (strings unchanged, anything else through tostring), or
--         nil when the drive is unusable, the info table is absent, or the
--         entry is nil/false
local function get_estimated_lifespan_info_prop(drive, prop_name)
    if not (drive and drive.Name) then
        log:notice('cannot get drive name')
        return
    end

    local lifespan_info = drive.estimated_remaining_lifespan_info
    if not lifespan_info then
        return nil
    end

    local value = lifespan_info[prop_name]
    if not value then
        return nil
    end
    if type(value) ~= "string" then
        value = tostring(value)
    end
    return value
end

-- Collect the slow-disk (IO deterioration) detection parameters.
-- @return list with one element per inspected drive; each element is a list
--         of { key = <name from io_key_names>, value = <string or nil> }.
--         An empty list is returned when there are no drives.
local function get_drives_io_diagnose_info()
    log:notice('start get drives io diag info')
    local all_drives = drive_collection.get_instance():get_all_drives()
    if next(all_drives) == nil then
        log:notice("[Storage] Can not find any drives, not get drives io diag info")
        return {}
    end

    local per_drive_results = {}
    for _, drive in pairs(all_drives) do
        -- Slow-disk detection only applies to drives that are present and
        -- whose media type is HDD (MediaType == 0)
        if drive.Presence ~= 0 and drive.MediaType == 0 then
            local entries = {}
            for _, key_name in pairs(io_key_names) do
                entries[#entries + 1] = {
                    key = key_name,
                    value = get_io_deterioration_info_prop(drive, key_name)
                }
            end
            per_drive_results[#per_drive_results + 1] = entries
        end
    end
    return per_drive_results
end

-- Collect the estimated-lifespan parameters of every present drive.
-- @return list with one element per present drive; each element is a list of
--         { key = <name from estimated_lifespan_key_names>, value = <string or nil> }.
--         An empty list is returned when there are no drives.
local function get_drives_estimatedlifespan_diagnose_info()
    log:notice('start get drives estimatedlifespan diag info')
    local all_drives = drive_collection.get_instance():get_all_drives()
    if next(all_drives) == nil then
        log:notice("[Storage] Can not find any drives, not get drives io diag info")
        return {}
    end

    local per_drive_results = {}
    for _, drive in pairs(all_drives) do
        if drive.Presence ~= 0 then
            local entries = {}
            for _, key_name in pairs(estimated_lifespan_key_names) do
                -- value may be nil; the rpc caller has to cope with that
                entries[#entries + 1] = {
                    key = key_name,
                    value = get_estimated_lifespan_info_prop(drive, key_name)
                }
            end
            per_drive_results[#per_drive_results + 1] = entries
        end
    end
    return per_drive_results
end

-- Return the diagnose parameters for the requested sub-health category.
-- @param subhealth_type one of common_def.SUBHEALTH_STR.IO_DETERIORATION or
--        common_def.SUBHEALTH_STR.ESTIMATED_LIFESPAN
-- @return per-drive list of key/value entries for that category
-- Raises custom_error.InvalidValue for any other type.
-- NOTE(review): custom_error.InvalidValue is raised as-is, not called; if it
-- is a message constructor it probably needs arguments — confirm.
function rpc_service_subhealth:get_drives_subhealth_diagnose_info(subhealth_type)
    local kinds = common_def.SUBHEALTH_STR
    if subhealth_type == kinds.IO_DETERIORATION then
        -- slow-disk detection parameters
        return get_drives_io_diagnose_info()
    elseif subhealth_type == kinds.ESTIMATED_LIFESPAN then
        -- estimated lifespan parameters
        return get_drives_estimatedlifespan_diagnose_info()
    end

    return error(custom_error.InvalidValue)
end

-- Write the IO deterioration health code of a drive.
-- @param drive       drive object whose IODeteriorationHealthCode is set
-- @param diag_result '0' or '1'; a nil/false value leaves the drive untouched
-- Raises { common_def.SM_CODE_PARA_DATA_ILLEGAL, 'IODeteriorationHealthCode',
-- diag_result } for any other input.
local function set_io_diag_result(drive, diag_result)
    if diag_result then
        local accepted = (diag_result == '0' or diag_result == '1')
        if not accepted then
            log:error('[Storage]The input %s to %s is invalid.', diag_result, drive.Name)
            error({common_def.SM_CODE_PARA_DATA_ILLEGAL, 'IODeteriorationHealthCode', diag_result})
        end

        local code = tonumber(diag_result)
        log:notice("set drive:%s healthcode:%s", drive.Name, code)
        drive.IODeteriorationHealthCode = code
    else
        log:debug('[Storage]Do not set io_deterioration_healthcode to %s.', drive.Name)
    end
end

-- Write the estimated remaining lifespan of a drive.
-- @param drive       drive object whose EstimatedRemainingLifespan is set
-- @param diag_result numeric string (or number); a nil/false value leaves
--        the drive untouched
-- Raises { common_def.SM_CODE_PARA_DATA_ILLEGAL, 'EstimatedRemainingLifespan',
-- diag_result } when the input is not convertible to a number, matching the
-- validation done by the other sub-health setters in this file.
local function set_estimated_remaining_lifespan(drive, diag_result)
    if not diag_result then
        log:debug('[Storage]Do not set estimated_remaining_lifespan to %s.', drive.Name)
        return
    end

    local estimated_remaining_lifespan = tonumber(diag_result)
    -- tonumber yields nil for non-numeric input; without this check the
    -- property would silently be assigned nil.
    if estimated_remaining_lifespan == nil then
        log:error('[Storage]The input %s to %s is invalid.', diag_result, drive.Name)
        error({common_def.SM_CODE_PARA_DATA_ILLEGAL, 'EstimatedRemainingLifespan', diag_result})
    end

    log:notice("set drive:%s estimatedremaininglifespan:%s", drive.Name, estimated_remaining_lifespan)
    drive.EstimatedRemainingLifespan = estimated_remaining_lifespan
end

-- Write the "remaining lifespan insufficient" status of a drive.
-- @param drive       drive object whose EstimatedRemainingLifespanInsufficient is set
-- @param diag_result '0' or '1'; a nil/false value leaves the drive untouched
-- Raises { common_def.SM_CODE_PARA_DATA_ILLEGAL,
-- 'EstimatedRemainingLifespanInsufficient', diag_result } for any other input.
local function set_remaining_lifespan_diag_result(drive, diag_result)
    if diag_result then
        local accepted = (diag_result == '0' or diag_result == '1')
        if not accepted then
            log:error('[Storage]The input %s to %s is invalid.', diag_result, drive.Name)
            error({common_def.SM_CODE_PARA_DATA_ILLEGAL, 'EstimatedRemainingLifespanInsufficient', diag_result})
        end

        local insufficient_flag = tonumber(diag_result)
        log:notice("set drive:%s lifespanstatus:%s", drive.Name, insufficient_flag)
        drive.EstimatedRemainingLifespanInsufficient = insufficient_flag
    else
        log:debug('[Storage]Do not set remaining_lifespan_insufficient to %s.', drive.Name)
    end
end

-- Apply a sub-health diagnose result to one drive.
-- @param ras_subhealth_data_info list of { Key = ..., Value = ... } entries.
--        'DriveName' selects the drive; 'IODeteriorationHealthCode',
--        'EstimatedRemainingLifespan' and
--        'EstimatedRemainingLifespanInsufficient' are applied when present.
-- Returns nothing. A missing DriveName or an unknown drive is logged and
-- ignored; invalid values raise from the individual setters.
function rpc_service_subhealth:set_drive_subhealth_diag_result(ras_subhealth_data_info)
    -- Flatten the key/value list into a lookup table
    local disk_subhealth_props = {}
    for _, subhealth_info in pairs(ras_subhealth_data_info) do
        disk_subhealth_props[subhealth_info.Key] = subhealth_info.Value
    end

    if not disk_subhealth_props.DriveName then
        log:error('[Storage]The input info is invalid')
        return
    end
    local drive_name = disk_subhealth_props.DriveName
    -- Go through the singleton instance like every other drive_collection
    -- call in this file, instead of invoking the method on the module table.
    local drive = drive_collection.get_instance():check_and_get_drive(drive_name)
    if not drive then
        log:error("[Storage] Can not find the drive:%s", drive_name)
        return
    end

    -- Slow-disk health code
    set_io_diag_result(drive, disk_subhealth_props.IODeteriorationHealthCode)
    -- Estimated remaining lifespan
    set_estimated_remaining_lifespan(drive, disk_subhealth_props.EstimatedRemainingLifespan)
    -- Estimated remaining lifespan alarm
    set_remaining_lifespan_diag_result(drive, disk_subhealth_props.EstimatedRemainingLifespanInsufficient)
end

-- Tracks whether a log-collection coroutine is currently running.
--   global: true while a collection for all drives ("*") is in progress
--   drives: drive name -> true while a single-drive collection is in progress
-- The table is defined on the class table, not created in ctor.
rpc_service_subhealth.log_collect_status = {
    global = false,
    drives = {}
}
-- Dump the drive log and record the write-amplification log for the drives
-- in `drives`. When `drive_name` is non-nil only drives with that Name are
-- handled. A failure on one drive is logged and does not stop the others.
local function collect_drive_logs(drives, drive_name)
    for _, drive in pairs(drives) do
        if drive_name == nil or drive.Name == drive_name then
            local ok, err = pcall(function()
                drive:dump_drive_log()
                drive:record_write_amp_log()
            end)
            if not ok then
                log:error('[Storage] Collect log of drive %s failed: %s', drive.Name, err)
            end
        end
    end
end

-- Start the drive log collection in a forked coroutine.
-- @param ctx        request context; ctx:get_initiator() is used for the operation log
-- @param drive_name "*" for all drives, otherwise the Name of a single drive
-- Raises ActionNotSupported("StartCollectLog") when there is no drive at all.
-- A request is skipped (only logged) while a conflicting collection is in
-- progress: any collection blocks a global one, and a global collection or a
-- collection of the same drive blocks a single-drive one.
function rpc_service_subhealth:start_collect_log(ctx, drive_name)
    local drives = drive_collection.get_instance():get_all_drives()
    if next(drives) == nil then
        log:operation(ctx:get_initiator(), 'storage', 'can not find any drives, end collect log')
        -- The file never brings base_msg into scope, so the original line
        -- failed with "attempt to index a nil value" instead of raising the
        -- intended error. Resolve the module at the point of use.
        -- NOTE(review): assumes the base messages live in 'messages.base',
        -- next to the already required 'messages.custom' — confirm.
        local base_msg = require 'messages.base'
        error(base_msg.ActionNotSupported("StartCollectLog"))
    end

    if drive_name == "*" then
        if self.log_collect_status.global or next(self.log_collect_status.drives) ~= nil then
            log:operation(ctx:get_initiator(), 'storage', 'global or single log in progress, skip global collect')
        else
            self.log_collect_status.global = true
            skynet.fork_once(function()
                collect_drive_logs(drives, nil)
                -- Reached even when a drive failed, so a single error can no
                -- longer block all later collections.
                self.log_collect_status.global = false
            end)
        end
    else
        if self.log_collect_status.global then
            log:operation(ctx:get_initiator(), 'storage', 'global log in progress, skip collect' .. drive_name)
        elseif self.log_collect_status.drives[drive_name] then
            log:operation(ctx:get_initiator(), 'storage', 'single log in progress, skip collect' .. drive_name)
        else
            self.log_collect_status.drives[drive_name] = true
            skynet.fork_once(function()
                collect_drive_logs(drives, drive_name)
                self.log_collect_status.drives[drive_name] = nil
            end)
        end
    end
    log:operation(ctx:get_initiator(), 'storage', 'end collect log')
end

-- Collect the diagnose log of one drive through the channel that matches the
-- way the drive is attached:
--   PCIE with identify_pd == false        -> collect_log_from_disk (needs the nvme object)
--   SAS with RefControllerId ~= 255       -> collect_log_from_raid
--   SAS SSD with RefControllerId == 255   -> collect_log_from_bma
-- @param ctx       request context, forwarded to collect_log_from_bma
-- @param drive_obj drive object
-- @return result of the selected collect call, or false when no nvme object
--         is found or no channel applies (per the original comment: true on
--         success, false on failure)
local function process_log_collection_from_different_ways(ctx, drive_obj)
    local protocols = common_def.DRIVE_PROTOCOL
    local collected

    if drive_obj.Protocol == protocols.PCIE and drive_obj.identify_pd == false then
        -- NOTE(review): the drive object itself is passed although the helper
        -- name suggests it takes a drive name — confirm against drive_collection.
        local nvme_obj = drive_collection.get_instance():get_nvme_by_drive_name(drive_obj)
        if nvme_obj then
            collected = drive_obj:collect_log_from_disk(nvme_obj)
        else
            log:error("This drive is not nvme disk, drive name is %s", drive_obj.Name)
            collected = false
        end
    elseif drive_obj.Protocol == protocols.SAS and drive_obj.RefControllerId ~= 255 then
        collected = drive_obj:collect_log_from_raid()
    elseif common_def.MEDIA_TYPE_STR[drive_obj.MediaType] == 'SSD'
        and drive_obj.Protocol == protocols.SAS
        and drive_obj.RefControllerId == 255 then
        log:notice("This drive is SSD direct attached disk, drive name is %s, bma id is %s",
            drive_obj.Name, drive_obj.bma_id)
        collected = drive_obj:collect_log_from_bma(ctx)
    else
        log:error("No belong to nvme, raid, or sas ssd direct attached disk, param is %s,%s,%s,%s,%s",
            drive_obj.Name, drive_obj.Protocol, drive_obj.identify_pd, drive_obj.RefControllerId,
            drive_obj.MediaType)
        collected = false
    end

    return collected
end

-- Collect the io latency data of the first present drive (Presence == 1)
-- whose Name equals target_drive; later matches are not visited.
-- @param ctx          request context, forwarded to the collection helper
-- @param drive_objs   table of drive objects to search
-- @param target_drive Name of the drive to collect
-- Returns nothing; a failed collection is only logged.
local function collect_single_drive_info(ctx, drive_objs, target_drive)
    for _, candidate in pairs(drive_objs) do
        if candidate.Presence == 1 and candidate.Name == target_drive then
            if not process_log_collection_from_different_ways(ctx, candidate) then
                log:error("Collect io latency data failed, drive name is %s.", candidate.Name)
            end
            return
        end
    end
end

-- Body of a diagnose-data collection task for one drive: reports Starting,
-- runs the collection, reports Completed and removes the task from
-- self.drives_collect_tasks.
-- @param ctx        request context, forwarded to the collection helpers
-- @param drive_name Name of the drive; also the key in self.drives_collect_tasks
function rpc_service_subhealth:collect_drives_details_task(ctx, drive_name)
    local current_task = self.drives_collect_tasks[drive_name]
    task.update(current_task, task_mgmt.state.Starting, task_mgmt.status.OK,
        "CollectDiagnoseDataProcess", {string.format('%s', current_task)})
    local drives = drive_collection.get_instance():get_all_drives()
    if next(drives) == nil then
        log:error("Can not find drive, stop the task, task id is %s", current_task)
        task.update(current_task, task_mgmt.state.Completed, task_mgmt.status.OK,
            "CollectDiagnoseDataProcess", {string.format('%s', current_task)})
        self.drives_collect_tasks[drive_name] = nil
        return
    end

    -- Run the collection protected. An error raised here used to skip the
    -- completion update and leave the entry in drives_collect_tasks, so later
    -- requests for this drive kept getting the id of a task that never ends.
    -- A raised error is handled like a failed collection: logged only.
    local ok, err = pcall(collect_single_drive_info, ctx, drives, drive_name)
    if not ok then
        log:error("Collect io latency data raised an error, drive name is %s, error: %s", drive_name, err)
    end

    task.update(current_task, task_mgmt.state.Completed, task_mgmt.status.OK,
        "CollectDiagnoseDataFinish", {string.format('%s', current_task)})
    self.drives_collect_tasks[drive_name] = nil
    log:notice("Finish collect task, task id is %s, drive name is %s", current_task, drive_name)
end

-- Create (or reuse) the diagnose-data collection task of one drive.
-- @param ctx           request context
-- @param drive_name    Name of the drive, key in self.drives_collect_tasks
-- @param presence_flag the recorded task is only reused when this equals 1
-- @return id of the reused or newly created task
function rpc_service_subhealth:collect_drives_diagnose_details(ctx, drive_name, presence_flag)
    local pending = self.drives_collect_tasks
    if presence_flag == 1 and pending[drive_name] ~= nil then
        log:notice("Already created task, task id is %s", pending[drive_name])
        return pending[drive_name]
    end

    local task_id = task.create(self.bus, 'CollectDiagnoseData', TASK_ATTACH_PATH)
    local function run_collection()
        -- the task is recorded inside the forked coroutine, right before it runs
        self.drives_collect_tasks[drive_name] = task_id
        log:operation(ctx:get_initiator(), 'storage', 'start collect io latency data')
        self:collect_drives_details_task(ctx, drive_name)
    end
    skynet.fork_once(run_collection)

    log:notice("Create collect task, task id is %s, drive name is %s", task_id, drive_name)
    return task_id
end

-- Start collecting diagnose data for the drive at obj_path.
-- @param obj_path  object path of a 'bmc.kepler.Systems.Storage.Drive' object
-- @param ctx       request context
-- @param data_type list of requested data types; collection only starts when
--        it contains common_def.SUBHEALTH_STR.VENDOR_IO_LATENCY
-- @return id of the collection task
-- Raises custom_error.InvalidValue when the object cannot be fetched,
-- data_type is missing/empty, or no supported type is requested.
function rpc_service_subhealth:collect_drive_diagnose_data(obj_path, ctx, data_type)
    local fetched, drive_obj = pcall(mdb.get_object, self.bus, obj_path, 'bmc.kepler.Systems.Storage.Drive')
    if not (fetched and drive_obj) then
        log:error("Can not get path obj, path = %s", obj_path)
        error(custom_error.InvalidValue)
    end

    if not data_type or #data_type == 0 then
        log:notice("Input data_type is invalid, not collect drive diagnose data")
        error(custom_error.InvalidValue)
    end

    -- Look for the io latency type among the requested types
    local wanted = common_def.SUBHEALTH_STR.VENDOR_IO_LATENCY
    local io_latency_type
    for _, requested in pairs(data_type) do
        if requested == wanted then
            io_latency_type = requested
            break
        end
    end

    if not io_latency_type then
        return error(custom_error.InvalidValue)
    end
    return self:collect_drives_diagnose_details(ctx, drive_obj.Name, drive_obj.Presence)
end

-- Return the diagnose data already stored for the drive at obj_path.
-- @param obj_path  object path of a 'bmc.kepler.Systems.Storage.Drive' object
-- @param ctx       request context (not used here)
-- @param data_type list of requested data types; only
--        common_def.SUBHEALTH_STR.VENDOR_IO_LATENCY is served
-- @return table { [VENDOR_IO_LATENCY] = drive.io_latency }
-- Raises custom_error.InvalidValue when the object cannot be fetched,
-- data_type is missing/empty, no supported type is requested, or the drive
-- is unknown to drive_collection.
function rpc_service_subhealth:get_drive_diagnose_data(obj_path, ctx, data_type)
    local fetched, mdb_drive = pcall(mdb.get_object, self.bus, obj_path, 'bmc.kepler.Systems.Storage.Drive')
    if not (fetched and mdb_drive) then
        log:error("Can not get path obj, path = %s", obj_path)
        error(custom_error.InvalidValue)
    end

    if not data_type or #data_type == 0 then
        log:error("Input data_type is invalid, not collect drive diagnose data")
        error(custom_error.InvalidValue)
    end

    -- Look for the io latency type among the requested types
    local wanted = common_def.SUBHEALTH_STR.VENDOR_IO_LATENCY
    local io_latency_type
    for _, requested in pairs(data_type) do
        if requested == wanted then
            io_latency_type = requested
            break
        end
    end
    if not io_latency_type then
        return error(custom_error.InvalidValue)
    end

    local drive_obj = drive_collection.get_instance():get_drive(mdb_drive.Name)
    if not drive_obj then
        log:error("This drive is not exist, drive name is %s", mdb_drive.Name)
        return error(custom_error.InvalidValue)
    end

    return { [common_def.SUBHEALTH_STR.VENDOR_IO_LATENCY] = drive_obj.io_latency }
end

-- Module result: the service class passed through mc.singleton.
return singleton(rpc_service_subhealth)