# RL_model_decision_functions.py

"""
Functions for converting counts to probabilities, evaluating agent decision and outcomes
"""

# from utils import *  # evaluate_outcome (used in assign_agent_outcomes below) is expected to come from here
import random
from random import choices

import numpy as np
import pandas as pd


#########################
## softmax probability ##
#########################

def softmax(x, beta=1):
    """Compute softmax values for each set of scores in x."""
    return np.exp(x * beta) / np.sum(np.exp(x * beta), axis=0)
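
# Illustrative values (hypothetical scores, not taken from the data):
# softmax(np.array([1, 2, 3])) is roughly [0.090, 0.245, 0.665], while
# softmax(np.array([1, 2, 3]), beta=4) is roughly [0.000, 0.018, 0.982] --
# a larger beta (inverse temperature) makes the highest-reward move dominate.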

def get_softmax_probabilities(df, columns):
    """
    Create a softmax dataframe storing the probabilities of choosing the
    rock, paper, or scissors move.
    This general softmax function is used for the human_reward_move model.
    """
    vals = df[columns]
    vals = vals.apply(softmax, axis=1, beta=4)
    return vals
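
# A minimal usage sketch (the reward column names below are assumptions for
# illustration, not necessarily the ones used elsewhere in the pipeline):
#
#     probs = get_softmax_probabilities(df, ['rock_reward', 'paper_reward', 'scissors_reward'])
#
# Each row of `probs` is then a probability distribution over rock/paper/scissors
# derived from that row's reward estimates.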

def get_softmax_probabilities_3b(df):
    """
    Create a softmax dataframe storing the probabilities of choosing the
    rock, paper, or scissors move.
    This softmax function is used for the human_reward_past_cur_move model.
    """
    # vals stores the reward list used for each human round.
    # The first round has no previous move, so start with a uniform distribution.
    vals = [[0.33, 0.33, 0.33]]
    for i in range(2, df.shape[0], 2):
        # agent's previous move
        pre_move = df.get('player_move').iloc[i - 2]
        # skip NaN / 'none' previous moves
        if pre_move != 'none' and not pd.isna(pre_move):
            # reward columns conditioned on that specific previous move
            reward_cols = [f'{pre_move}_rock_reward',
                           f'{pre_move}_paper_reward',
                           f'{pre_move}_scissors_reward']
            val = df[reward_cols].iloc[i].tolist()
            vals.append(val)
        else:
            # if there is no previous move, reuse the last reward list
            val = vals[-1]
            vals.append(val)
    # convert the reward lists to softmax probabilities
    soft_max = [softmax(x) for x in vals]
    sofm = pd.DataFrame(soft_max, columns=['softmax_prob_rock', 'softmax_prob_paper', 'softmax_prob_scissors'])

    # keep only human rows
    df_new = df[df.is_bot == 0].reset_index()
    # concatenate human rows with the softmax distribution
    df_new = pd.concat([df_new, sofm], axis=1)
    return df_new
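
# For example, if the agent's previous move was 'rock', the columns read are
# 'rock_rock_reward', 'rock_paper_reward', and 'rock_scissors_reward' -- i.e.
# the reward estimates for each current move conditioned on that previous move.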

def get_softmax_probabilities_3c(df):
    """
    Generate the softmax probability distribution for each round so that
    moves can be sampled from the distribution.
    """
    # The first round has no previous move, so start with a uniform distribution.
    vals = [[0.33, 0.33, 0.33]]
    for i in range(2, df.shape[0], 2):
        # opponent's previous move, hence i - 1 instead of i - 2
        pre_move = df.get('player_move').iloc[i - 1]
        if pre_move != 'none' and not pd.isna(pre_move):
            reward_cols = [f'opponent_{pre_move}_rock_reward',
                           f'opponent_{pre_move}_paper_reward',
                           f'opponent_{pre_move}_scissors_reward']
            val = df[reward_cols].iloc[i].tolist()
            vals.append(val)
        else:
            val = vals[-1]
            vals.append(val)
    soft_max = [softmax(x) for x in vals]
    sofm = pd.DataFrame(soft_max, columns=['softmax_prob_rock', 'softmax_prob_paper', 'softmax_prob_scissors'])

    # keep only human rows
    df_new = df[df.is_bot == 0].reset_index()
    df_new = pd.concat([df_new, sofm], axis=1)
    return df_new

def get_softmax_probabilities_mix(df_agent_past, df_opponent_past):
    """
    Choose between the human_reward_past and opponent_reward_past strategies
    on each round, depending on which one has the higher reward.
    """
    # The first round has no previous moves, so start with a uniform distribution.
    vals = [[0.33, 0.33, 0.33]]
    # iterate over the rounds present in both dataframes
    for i in range(2, min(df_agent_past.shape[0], df_opponent_past.shape[0]), 2):
        oppo_pre_move = df_agent_past.get('player_move').iloc[i - 1]
        agent_pre_move = df_agent_past.get('player_move').iloc[i - 2]

        if (agent_pre_move != 'none' and oppo_pre_move != 'none'
                and not pd.isna(oppo_pre_move) and not pd.isna(agent_pre_move)):
            # df_agent_past only carries bot==0 reward columns; df_opponent_past carries
            # both bot==0 and bot==1 rows, so the two dataframes' indices may not match.
            agent_reward_cols = [f'{agent_pre_move}_rock_reward',
                                 f'{agent_pre_move}_paper_reward',
                                 f'{agent_pre_move}_scissors_reward']
            oppo_reward_cols = [f'opponent_{oppo_pre_move}_rock_reward',
                                f'opponent_{oppo_pre_move}_paper_reward',
                                f'opponent_{oppo_pre_move}_scissors_reward']
            val_agent = df_agent_past[agent_reward_cols].iloc[i].tolist()
            val_oppo = df_opponent_past[oppo_reward_cols].iloc[i].tolist()
            # keep the reward list of whichever strategy has the higher total reward
            if sum(val_agent) > sum(val_oppo):
                val = val_agent
            else:
                val = val_oppo
            vals.append(val)
        else:
            val = vals[-1]
            vals.append(val)

    soft_max = [softmax(x) for x in vals]
    sofm = pd.DataFrame(soft_max, columns=['softmax_prob_rock', 'softmax_prob_paper', 'softmax_prob_scissors'])

    # keep only human rows
    df_new = df_agent_past[df_agent_past.is_bot == 0].reset_index()
    df_new = pd.concat([df_new, sofm], axis=1)
    return df_new

def get_softmax_probabilities_combined(df):
    """
    Generate the softmax probability distribution for each round using rewards
    conditioned on both the opponent's and the agent's previous moves.
    """
    # The first round has no previous moves, so start with a uniform distribution.
    vals = [[0.33, 0.33, 0.33]]
    for i in range(2, df.shape[0], 2):
        oppo_pre_move = df.get('player_move').iloc[i - 1]
        agent_pre_move = df.get('player_move').iloc[i - 2]

        if (agent_pre_move != 'none' and oppo_pre_move != 'none'
                and not pd.isna(oppo_pre_move) and not pd.isna(agent_pre_move)):
            reward_cols = [f'opponent_{oppo_pre_move}_{agent_pre_move}_rock_reward',
                           f'opponent_{oppo_pre_move}_{agent_pre_move}_paper_reward',
                           f'opponent_{oppo_pre_move}_{agent_pre_move}_scissors_reward']
            val = df[reward_cols].iloc[i].tolist()
            vals.append(val)
        else:
            val = vals[-1]
            vals.append(val)
    soft_max = [softmax(x) for x in vals]
    sofm = pd.DataFrame(soft_max, columns=['softmax_prob_rock', 'softmax_prob_paper', 'softmax_prob_scissors'])

    # keep only human rows
    df_new = df[df.is_bot == 0].reset_index()
    df_new = pd.concat([df_new, sofm], axis=1)
    return df_new


#############################
### Move choice functions ###
#############################

def pick_move(df, sofm):
    """
    Pick the agent's move based on the softmax probability distribution in sofm
    (one row of probabilities per round).
    """
    moves = np.array([])
    move_choices = ['rock', 'paper', 'scissors']
    for i in range(df.shape[0]):
        distribution = sofm.iloc[i].tolist()
        # random.choices returns a one-element list sampled with the given weights
        chosen_move = choices(move_choices, weights=distribution)
        moves = np.append(moves, chosen_move)
    df = df.assign(agent_move=moves)
    return df
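
# A minimal end-to-end sketch (the reward column names are assumptions for
# illustration):
#
#     probs = get_softmax_probabilities(df, ['rock_reward', 'paper_reward', 'scissors_reward'])
#     df_with_moves = pick_move(df, probs)
#     df_with_moves['agent_move']  # one sampled move per row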

def pick_move_v2(df):
    """
    Pick the agent's move by sampling from the softmax probability columns
    already stored on the dataframe.
    """
    moves = np.array([])
    move_choices = ['rock', 'paper', 'scissors']
    for i in range(df.shape[0]):
        # ith [rock_prob, paper_prob, scissors_prob] from the input df
        distribution = df[['softmax_prob_rock', 'softmax_prob_paper', 'softmax_prob_scissors']].iloc[i].tolist()
        chosen_move = random.choices(move_choices, weights=distribution)
        moves = np.append(moves, chosen_move)
    df = df.assign(agent_move=moves)  # agent_move stores the sampled moves
    return df

def assign_agent_outcomes(df):
    """
    Assign outcomes for the agent based on the agent's move choices.
    df should include only human rows, since agent outcomes are irrelevant when simulating bots.
    Relies on evaluate_outcome, which is not defined in this module (see the utils import above).
    """
    df = df.assign(agent_outcome=df.apply(lambda x: evaluate_outcome(x['agent_move'], x['opponent_move']), axis=1))
    return df
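
# A sketch of the intended simulation flow, assuming the dataframe carries an
# 'opponent_move' column and evaluate_outcome is available (e.g. from utils);
# both of those are assumptions rather than guarantees of this module:
#
#     human_df = get_softmax_probabilities_3b(df)   # add softmax probability columns
#     human_df = pick_move_v2(human_df)             # sample an agent move per round
#     human_df = assign_agent_outcomes(human_df)    # score agent vs. opponent moves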