Python, pandas and xlsxwriter, VMware RVTool Export .xlsx files – Mask sensitive information

Scenario: Need to export data using RVTool for Private cloud DC migration to Azure/AWS Cloud. I have been asked to mask any sensitive information like host, hostnames, IP Addresses etc. Provide the information for scoping the migration work as the initial steps.

  1. To develop this, we need to make a tool to read all .xlsx files and read and print the names of each tabs.
import pandas as pd
import os

# Set the file path
file_path = 'C:\\Python312\\RVTool_masker\\'

# Function to read and print sheet names
def read_excel_files(file_path):
    # Get all files in the directory
    files = os.listdir(file_path)
    
    # Loop through each file
    for file in files:
        # Check if the file is an Excel file
        if file.endswith('.xlsx'):
            print("File:", file)
            # Read the Excel file
            xls = pd.ExcelFile(os.path.join(file_path, file))
            # Get sheet names
            sheet_names = xls.sheet_names
            # Print sheet names
            print("Sheet names:")
            for sheet_name in sheet_names:
                print(sheet_name)
            print("\n")

# Call the function
read_excel_files(file_path)

2. Now, use pandas to read and mask sensitive information in the files with *** and ###. use xlsxwriter to write to new excel files.

import pandas as pd
import os

# Set the file path
file_path = 'C:\\Python312\\RVTool_masker\\'

# Function to mask VM names and IP addresses and save to separate files
def mask_vm_names(file_path):
    # Function to mask VM names
    def mask_vm_name(vm_name):
        if isinstance(vm_name, str) and len(vm_name) > 6:
            return vm_name[:3] + '*' * (len(vm_name) - 6) + vm_name[-3:]
        else:
            return vm_name

    # Function to mask IP addresses
    def mask_ip_address(ip_address):
        if isinstance(ip_address, str) and len(ip_address.split('.')) == 4:
            octets = ip_address.split('.')
            return octets[0] + '.' + octets[1] + '.###.###'
        else:
            return ip_address

    # Get all files in the directory
    files = os.listdir(file_path)
    
    # Loop through each file
    for file in files:
        # Check if the file is an Excel file
        if file.endswith('.xlsx'):
            print("Processing file:", file)
            # Read the Excel file
            xls = pd.ExcelFile(os.path.join(file_path, file))
            
            # Create a writer to save masked data
            writer = pd.ExcelWriter(os.path.join(file_path, file.replace('.xlsx', '_masked.xlsx')), engine='xlsxwriter')
            
            # Loop through each sheet
            for sheet_name in xls.sheet_names:
                # Read the sheet
                df = pd.read_excel(xls, sheet_name)
                
                # Check if specific columns exist, otherwise find the first string column
                target_columns = ['VM', 'Name', 'Host', 'Port', 'Switch']  # Define your target columns here
                vm_columns = [col for col in df.columns if any(target in col for target in target_columns)]
                if not vm_columns:
                    for col in df.columns:
                        if df[col].dtype == 'object':
                            vm_columns = [col]
                            break
                
                # Mask VM names
                for vm_column in vm_columns:
                    df[vm_column] = df[vm_column].apply(mask_vm_name)
                
                # Mask IP addresses
                if 'Primary IP Address' in df.columns:
                    df['Primary IP Address'] = df['Primary IP Address'].apply(mask_ip_address)
                
                # Write masked data to the writer
                df.to_excel(writer, sheet_name=sheet_name, index=False)
            
            # Save the masked data to a new file
            writer.close()
            print("Saved masked data to:", file.replace('.xlsx', '_masked.xlsx'))

# Call the function
mask_vm_names(file_path)

3. Reiterate to mask “Customer” colunm to replace with X’s

import pandas as pd
import os

# Set the file path
file_path = 'C:\\Python312\\RVTool_masker\\'

# Function to mask VM names, IP addresses, and customer names and save to separate files
def mask_vm_names(file_path):
    # Function to mask VM names
    def mask_vm_name(vm_name):
        if isinstance(vm_name, str) and len(vm_name) > 6:
            return vm_name[:3] + '*' * (len(vm_name) - 6) + vm_name[-3:]
        else:
            return vm_name

    # Function to mask IP addresses
    def mask_ip_address(ip_address):
        if isinstance(ip_address, str) and len(ip_address.split('.')) == 4:
            octets = ip_address.split('.')
            return octets[0] + '.' + octets[1] + '.###.###'
        else:
            return ip_address
    
    # Function to mask customer names
    def mask_customer_name(customer_name):
        if isinstance(customer_name, str) and len(customer_name) > 3:
            return customer_name[:2] + 'X' * (len(customer_name) - 3) + customer_name[-1]
        else:
            return customer_name

    # Get all files in the directory
    files = os.listdir(file_path)
    
    # Loop through each file
    for file in files:
        # Check if the file is an Excel file
        if file.endswith('.xlsx'):
            print("Processing file:", file)
            # Read the Excel file
            xls = pd.ExcelFile(os.path.join(file_path, file))
            
            # Create a writer to save masked data
            writer = pd.ExcelWriter(os.path.join(file_path, file.replace('.xlsx', '_masked.xlsx')), engine='xlsxwriter')
            
            # Loop through each sheet
            for sheet_name in xls.sheet_names:
                # Read the sheet
                df = pd.read_excel(xls, sheet_name)
                
                # Check if specific columns exist, otherwise find the first string column
                target_columns = ['VM', 'Name', 'Host', 'Port', 'Switch']  # Define your target columns here
                vm_columns = [col for col in df.columns if any(target in col for target in target_columns)]
                if not vm_columns:
                    for col in df.columns:
                        if df[col].dtype == 'object':
                            vm_columns = [col]
                            break
                
                # Mask VM names
                for vm_column in vm_columns:
                    df[vm_column] = df[vm_column].apply(mask_vm_name)
                
                # Mask IP addresses
                if 'Primary IP Address' in df.columns:
                    df['Primary IP Address'] = df['Primary IP Address'].apply(mask_ip_address)
                
                # Mask customer names
                if 'Customer' in df.columns:
                    df['Customer'] = df['Customer'].apply(mask_customer_name)
                
                # Write masked data to the writer
                df.to_excel(writer, sheet_name=sheet_name, index=False)
            
            # Save the masked data to a new file
            writer.close()
            print("Saved masked data to:", file.replace('.xlsx', '_masked.xlsx'))

# Call the function
mask_vm_names(file_path)

Leave a comment