-- Copyright (c) 2024 Huawei Technologies Co., Ltd.
-- openUBMC is licensed under Mulan PSL v2.
-- You can use this software according to the terms and conditions of the Mulan PSL v2.
-- You may obtain a copy of Mulan PSL v2 at: http://license.coscl.org.cn/MulanPSL2
-- THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND,
-- EITHER EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT,
-- MERCHANTABILITY OR FIT FOR A PARTICULAR PURPOSE.
-- See the Mulan PSL v2 for more details.

local singleton = require 'mc.singleton'
local skynet = require 'skynet'
local skynet_queue = require 'skynet.queue'
local ipmi = require 'ipmi'
local fructl = require 'gpu_service.fructl_handler'
local comp_code = ipmi.types.Cc
local file_sec = require 'utils.file'
local utils = require 'mc.utils'
local utils_core = require 'utils.core'
local gpu_object = require 'gpu_service.gpu_object'
local vastai_gpu_object = require 'gpu_service.vastai_gpu_object'
local class = require 'mc.class'
local mdb = require 'mc.mdb'
local log = require 'mc.logging'

-- Service class defined in this file; the module returns it wrapped by `singleton`.
local gpu_service = class(nil, nil, true)

-- Service name passed to `self.bus:pcall` in get_sub_objs.
local HOST_AGENT_APP = 'bmc.kepler.host_agent'
-- Object path whose managed objects are enumerated by update_info_via_bma.
local GPU_PATH = '/bmc/kepler/Systems/1/Sms/1/ComputerSystem/Systems/1/GPUs'
-- Manufacturer string that makes add_object wrap the object with vastai_gpu_object.
local VASTAI_MANU <const> = "Vastai Technologies"

--- Initialise the service state.
-- @param bus bus handle, stored and used later for remote calls
function gpu_service:ctor(bus)
    -- Guards that keep add_object from starting its background tasks twice.
    self.load_task_start = false
    self.smbus_postbox_task_start = false

    -- Wrapped GPU objects; add_object appends to this list.
    self.gpu_collection = {}
    -- 'GPUs' object that receives ConsumedPowerWatt; set by on_add_object.
    self.gpu_metrics = nil

    self.bus = bus
    -- skynet queue through which power recalculations are run.
    self.queue = skynet_queue()
end

--- Initialisation hook. Intentionally empty: all state is set up in ctor.
function gpu_service:init()
end

--- Handle a newly added object of class 'GPU' or 'GPUs'.
-- Any other class name is ignored without logging.
-- @param class_name class of the added object
-- @param object the added object
-- @param position position value forwarded to add_object
function gpu_service:on_add_object(class_name, object, position)
    if class_name == 'GPU' then
        self:add_object(object, position)
        -- Recompute the aggregated power whenever this GPU's PowerWatts changes.
        object.property_changed:on(function(prop, _)
            if prop ~= 'PowerWatts' then
                return
            end
            self.queue(function()
                self:update_total_power()
            end)
        end)
    elseif class_name == 'GPUs' then
        -- Remember the metrics object and publish the current total right away.
        self.gpu_metrics = object
        self:update_total_power()
    else
        return
    end

    log:info('[gpu_service] Add object, Class: %s', class_name)
end

-- Host power state seen on the previous pass of the power-monitor loop that
-- add_object starts. It begins as 'ON', so a host that is already on when the
-- loop first runs is handled as steady-state rather than as a fresh power-on.
local last_power_status = 'ON'

--- Wrap a newly added GPU object, track it, and start the background tasks.
-- The first call starts two long-running tasks; later calls only register
-- the GPU:
--   * a load loop that calls start_load, then sleeps 200 ticks, forever;
--   * a power-monitor loop that polls the host power state and starts or
--     stops the per-GPU action tasks accordingly.
-- @param obj GPU object to wrap (its Manufacturer picks the wrapper class)
-- @param position position value reported with the object; stored on the
--                 wrapper so on_delete_object can find the entry again
function gpu_service:add_object(obj, position)
    local gpu
    if obj.Manufacturer == VASTAI_MANU then
        gpu = vastai_gpu_object.new(obj)
    else
        gpu = gpu_object.new(obj)
    end

    -- remove_ele_by_position matches entries on `position`. Record it here
    -- unless the wrapper already carries one, otherwise a deleted GPU would
    -- never be found in the collection.
    if gpu.position == nil then
        gpu.position = position
    end
    table.insert(self.gpu_collection, gpu)

    if not self.load_task_start then
        self.load_task_start = true
        skynet.fork_once(function()
            while true do
                self:start_load()
                skynet.sleep(200)
            end
        end)
    end

    if self.smbus_postbox_task_start then
        return
    end
    self.smbus_postbox_task_start = true
    skynet.fork_loop({count = 0}, function()
        -- The system id of the first registered GPU is used for all polling.
        local sysid = gpu.gpu.SystemId
        while true do
            local power_status = fructl.get_power_status(self.bus, sysid)
            if power_status ~= "ON" then
                -- Host not on: stop per-GPU tasks and clear their properties.
                self:stop_action_task()
                self:clear_info()
                skynet.sleep(3000)
            elseif last_power_status ~= power_status then
                -- Just switched to ON: wait one extra interval before the
                -- action tasks are started on a later pass.
                skynet.sleep(3000)
            else
                -- Steady ON: (re)start the per-GPU action tasks.
                self:start_action_task()
            end
            last_power_status = power_status
            skynet.sleep(3000)
        end
    end)
end

--- Sum PowerWatts over all tracked GPUs and publish it as ConsumedPowerWatt.
-- Does nothing until the 'GPUs' metrics object has been registered. GPUs
-- whose PowerWatts cannot be read are logged and left out of the sum.
function gpu_service:update_total_power()
    if not self.gpu_metrics then
        return
    end

    local total_power = 0
    for _, obj in pairs(self.gpu_collection) do
        local mds_gpu = obj.gpu
        local watts = mds_gpu and mds_gpu['PowerWatts']
        if not watts then
            log:error("get powerwatts failed.")
        else
            -- GPU properties are read through `.gpu` elsewhere in this file,
            -- so take the name from there rather than from the wrapper.
            log:debug("GPU[%s].PowerWatts change to %s", mds_gpu.Name, watts)
            total_power = total_power + watts
        end
    end
    self.gpu_metrics['ConsumedPowerWatt'] = total_power
end

--- Remove the first element of a sequence whose `position` field matches.
-- The list is walked in index order so that, if several entries share a
-- position, it is always the lowest-indexed one that is removed.
-- @param list sequence of tables carrying a `position` field
-- @param position value to match against each element's `position`
-- @return true when an element was removed, false when nothing matched
local function remove_ele_by_position(list, position)
    for i = 1, #list do
        if list[i].position == position then
            table.remove(list, i)
            return true
        end
    end
    return false
end

--- Handle removal of an object of class 'GPU' or 'GPUs'.
-- Any other class name is ignored. The deletion is logged exactly once.
-- @param class_name class of the removed object
-- @param mds_obj the removed object (unused)
-- @param position position value identifying the removed GPU
function gpu_service:on_delete_object(class_name, mds_obj, position)
    local switch = {
        ['GPU'] = function()
            remove_ele_by_position(self.gpu_collection, position)
            -- The removed GPU must no longer count towards the published
            -- total; recompute through the same queue used for
            -- PowerWatts change notifications.
            self.queue(function()
                self:update_total_power()
            end)
        end,
        ['GPUs'] = function()
            self.gpu_metrics = nil
        end
    }

    local handler = switch[class_name]
    if handler then
        log:notice('Delete object, class: %s, position: %s', class_name, position)
        handler()
    end
end

--- Fetch the managed objects below `path` from the host agent service.
-- Calls ObjectManager.GetManagedObjects up to three times, sleeping 100
-- ticks between attempts.
-- @param path object path to query
-- @return the managed-objects result, or nil when every attempt failed
function gpu_service:get_sub_objs(path)
    local max_attempts = 3
    local retry_interval = 100

    for attempt = 1, max_attempts do
        local err, objects = self.bus:pcall(HOST_AGENT_APP, path,
            'org.freedesktop.DBus.ObjectManager', 'GetManagedObjects', '')
        if not err and objects then
            return objects
        end
        -- Only wait when another attempt follows; sleeping after the last
        -- failure would just delay the caller.
        if attempt < max_attempts then
            skynet.sleep(retry_interval)
        end
    end
    return nil
end

--- Fetch the summary objects below `path` and hand them to every tracked GPU.
-- Nothing happens when the summary objects cannot be fetched.
-- @param path object path of one GPU resource
-- @param gpu_obj the GPU resource entry found at that path
function gpu_service:load_gpu_summary(path, gpu_obj)
    local summary = self:get_sub_objs(path)
    if summary then
        for _, entry in pairs(self.gpu_collection) do
            -- Each GPU receives its own info table.
            local info = { GPUResource = gpu_obj, Summary = summary }
            entry:update_summary_info(info)
        end
    end
end

--- Run one load pass, shielding the caller from any error it raises.
-- A failure is logged at debug level instead of being dropped silently;
-- debug is used because the caller repeats this pass in a loop.
function gpu_service:start_load()
    local ok, err = pcall(self.load_gpu_objects, self)
    if not ok then
        log:debug('[gpu_service] load gpu objects failed, err: %s', err)
    end
end

--- Enumerate the objects below GPU_PATH and load the summary of each one.
-- Does nothing when the enumeration fails.
function gpu_service:update_info_via_bma()
    local managed = self:get_sub_objs(GPU_PATH)
    for sub_path, resource in pairs(managed or {}) do
        self:load_gpu_summary(sub_path, resource)
    end
end

--- Ask every tracked GPU to apply its default summary values.
function gpu_service:update_default_info()
    local gpus = self.gpu_collection
    for _, entry in pairs(gpus) do
        entry:update_summary_default()
    end
end

--- Refresh GPU summary data, choosing the source from the Sms objects.
-- Both Sms interfaces are looked up first; if either lookup raises, nothing
-- is updated. When SmsStatus.State is 0 and Sms.Registered is true the data
-- is loaded via update_info_via_bma, otherwise defaults are applied.
function gpu_service:load_gpu_objects()
    local status_ok, sms_status_obj =
        pcall(mdb.get_object, self.bus, '/bmc/kepler/Systems/1/Sms', 'bmc.kepler.Systems.Sms.SmsStatus')
    local sms_ok, sms_obj =
        pcall(mdb.get_object, self.bus, '/bmc/kepler/Systems/1/Sms', 'bmc.kepler.Systems.Sms')
    if not (status_ok and sms_ok) then
        return
    end

    local use_bma = sms_status_obj.State == 0 and sms_obj.Registered == true
    if not use_bma then
        self:update_default_info()
        return
    end
    self:update_info_via_bma()
end

--- Ask every tracked GPU to clear its properties.
function gpu_service:clear_info()
    local gpus = self.gpu_collection
    for _, entry in pairs(gpus) do
        entry:clear_properties()
    end
end

--- Start the action task of every tracked GPU, logging each key visited.
function gpu_service:start_action_task()
    log:info("[general_hardware] gpu start_action_task")
    local gpus = self.gpu_collection
    for key, entry in pairs(gpus) do
        log:info("[general_hardware] gpu start_action_task %s ", key)
        entry:start_action_task()
    end
end

--- Stop the action task of every tracked GPU.
function gpu_service:stop_action_task()
    local gpus = self.gpu_collection
    for _, entry in pairs(gpus) do
        entry:stop_action_task()
    end
end

--- Look up the model of the GPU located at a given PCIe bus/device/function.
-- For each tracked GPU the matching PCIeDevice object is queried and its
-- Bus/Device/Function values are compared with the requested ones.
-- @param b_num PCIe bus number to match
-- @param d_num PCIe device number to match
-- @param f_num PCIe function number to match
-- @return model string and comp_code.Success on a match; otherwise an empty
--         string and comp_code.ParmOutOfRange
function gpu_service:get_gpu_model(b_num, d_num, f_num)
    log:info('get_gpu_model b = %s, d = %s, f = %s', b_num, d_num, f_num)
    local service = 'bmc.kepler.pcie_device'
    local interface = 'bmc.kepler.Systems.PCIeDevices.PCIeDevice'
    for _, v in ipairs(self.gpu_collection) do
        local processor_info = v:get_processor_info()
        local path = string.format('/bmc/kepler/Systems/%d/PCIeDevices/PCIeDevice_%s_%s',
            processor_info.SystemId, processor_info.ObjectIdentifier[2], processor_info.ObjectIdentifier[4])
        log:info('get_gpu_model path is %s', path)
        local err, pcie_device_obj = self.bus:pcall(service, path,
            'org.freedesktop.DBus.Properties', 'GetAll', 's', interface)
        if err then
            -- A failed lookup for one GPU must not hide a later GPU that
            -- does match, so keep searching instead of returning here.
            log:debug('get pcie_device obj fail' )
        else
            local bus = pcie_device_obj['Bus']:value()
            local device = pcie_device_obj['Device']:value()
            local func = pcie_device_obj['Function']:value()
            if bus == b_num and device == d_num and func == f_num then
                return processor_info.Model, comp_code.Success
            end
        end
    end
    return '', comp_code.ParmOutOfRange
end

--- Write a table describing every GPU in `obj_set` to `file`.
-- The header name is written first; the column header, one row per GPU and
-- a trailing '\r\r' are then written in a single call. GPUs whose
-- manufacturer contains 'NVIDIA' (case-insensitive) get the extended row
-- with capability and error-page columns.
-- @param file open file handle to write to
-- @param obj_set collection of wrapped GPU objects
-- @param header_name title line written before the table
function gpu_service:on_dump_gpu_info(file, obj_set, header_name)
    file:write(header_name)

    local pieces = {
        string.format(
            '%-4s | %-6s | %-2s | %-20s | %-20s | %-16s | %-16s | %-10s | %-10s | %-10s | %-10s | %-4s | %s\n',
            'Slot', 'BoardId', 'GPUType', 'Name', 'Manufacturer', 'SerialNum', 'FirmVer',
            'Capab0', 'Capab1', 'Capab2', 'Capab3', 'SBE', 'DBE')
    }

    for _, v in pairs(obj_set) do
        local card = v.gpu
        local row
        if string.upper(card.Manufacturer):find('NVIDIA') then
            row = string.format(
                '%u | 0x%-5x | %u | %-20s | %-20s | %-16s | %-16s | 0x%-8x | 0x%-8x | 0x%-8x | 0x%-8x | %u | %u',
                card.Slot, 0, 0, card.Name, card.Manufacturer, card.SerialNumber, card.FirmwareVersion,
                v.Capability0, v.Capability1, v.Capability2, v.Capability3,
                card.SingleBitErrorPageCount, card.DoubleBitErrorPageCount)
        else
            row = string.format(
                '%-4u | 0x%-5x | %-7u | %-20s | %-20s | %-16s | %s',
                card.Slot, 0, 0, card.Name, card.Manufacturer, card.SerialNumber, card.FirmwareVersion)
        end
        pieces[#pieces + 1] = row
        pieces[#pieces + 1] = '\n'
    end

    pieces[#pieces + 1] = '\r\r'
    file:write(table.concat(pieces))
    log:notice('%s collect finish', header_name)
end

--- Dump callback: append the GPU table to `<dest_path>/card_info`.
-- Does nothing when no GPU is tracked. The file is opened in append mode,
-- its mode is set to owner read/write plus group read, and the table is
-- written inside utils.safe_close_file.
-- @param ctx call context (unused)
-- @param dest_path directory that receives the dump file
function gpu_service:on_dump_cb(ctx, dest_path)
    if next(self.gpu_collection) == nil then
        return
    end
    local file_name = dest_path .. '/card_info'
    local file, err = file_sec.open_s(file_name, 'a+')
    if not file then
        log:error('open file failed, err: %s', err)
        return
    end
    utils.safe_close_file(file, function()
        utils_core.chmod_s(file_name, utils.S_IRUSR | utils.S_IWUSR | utils.S_IRGRP)
        -- Keep the dump best-effort, but report a failure instead of
        -- silently leaving a partial file behind.
        local ok, dump_err = pcall(self.on_dump_gpu_info, self, file, self.gpu_collection, 'GPU Card Info\n')
        if not ok then
            log:error('dump gpu card info failed, err: %s', dump_err)
        end
    end)
end

-- Export the service class wrapped by `singleton`.
return singleton(gpu_service)