我有一个Python脚本,它的意思是导入一个CSV文件,删除某些列,添加两列,然后根据一些变量确定一个人是否符合竞选资格。基本上,简单的数据操作/准备,然后导出结果。
一切都按预期工作,除了它始终跳过第一行数据(标题行之后的行)。
我一直没能理解“为什么”,但我可以证明它发生了。
如果我在标题行和我的数据之间手动插入一个
null
行,一切都会按预期运行。如果我把第一行放在我的 CSV 文件的末尾,所有数据导入(包括最后一行 - 我这样做是为了确认我的数据中是否有某些东西导致过早的行返回或东西)。
任何关于从哪里开始故障排除的建议将不胜感激。据我所知,我的 for 循环只是从文件的第 3 行而不是第 2 行开始。
上述 Python 文件 应该 导出在
Video
列中具有 Type
值且在日期参数范围内的所有数据行。相反,即使有 Video
作为 Type
并且它落在日期参数内,第一行数据(文件中的第二行)也会被跳过。
编辑 将代码添加到这篇文章。对不起!
# Data cleansing for apify data import
import csv
import chardet
from datetime import datetime
# init variables
# This is the date format for the data downloaded from Apify
# It is used to convert the text string into a true date value before exporting to CSV
#date_format = '%Y-%m-%dT%H:%M:%S.%fZ'
date_format = '%Y-%m-%d %H:%M:%S'
# This is the start date for the timeframe that you are performing the data
# clean-up for. The format is (yyyy,m,d,h,m,s)
# This variable WILL needed to be updated each year
start_dttm = datetime(2023,1,1,0,0,0,1)
# This is the end date for the timeframe you are performing the data
# clean-up for. The format is (yyyy,m,d,h,m,s)
# This variable WILL needed to be updated each year
end_dttm = datetime(2023,5,31,23,59,59)
# This variable is used to return whether the user used the proper
# hashtag to qualify/be counted toward the campaign (e.g. in 2022 the campaign
# was The Witcher)
camp_elig = False
# This is the file name for the initial input file downloaded from Apify
# it needs to match exactly the file.
# input_file = 'inputstm2022.csv'
input_file = 'dataset_instagram-hashtag-scraper_2023-04-17_14-26-18-399.csv'
# This is the folder path where the Apify export file resides as well
# as the location of where the output csv will be
# file_path = 'D:\\consulting\\AudreyHelpsActors\\'
file_path = 'D:\\Nextcloud\\Consulting\\selfTapeMay\\'
# This is the full path of the output file
output_csv = file_path + 'output.csv'
# This is the full path of the input file
input_csv = file_path + input_file
# Determine encoding
def determine_encoding(file_path):
with open(file_path, 'rb') as file:
result = chardet.detect(file.read())
return result['encoding']
# Before the csv file is imported for manipulation
# add a column to the file at the end. This column
# is intended to store a Y/N flag for if the user participated
# in a particular campaign during Self Tape May. For example:
# in 2022 the campaign was "Witcher sides". If the user leveraged
# a hashtag mentioning The Witcher they should have a 'Y' in this column
def add_campaign_col(in_file, encoding):
# Open the CSV file for reading
with open(in_file, 'r', encoding=encoding) as file:
# Read the contents of the CSV file into a list of rows
reader = csv.reader(file)
rows = list(reader)
# Add a header for the new column to the first row
rows[0].append('campaignFlag')
rows[0].append('locationName')
rows[0].append('_id')
rows[0].append('_createdDate')
rows[0].append('_updatedDate')
rows[0].append('_owner')
# Open the CSV file for writing
with open(in_file, 'w', newline='', encoding=encoding) as file:
# Write the updated list of rows to the CSV file
writer = csv.writer(file)
writer.writerows(rows)
return
# Identify the columns in the csv file that include the word
# `hashtag` in their name. This will be the starting point for me to
# loop through the `hashtag` columns to see if they used the campaign
# hashtag (e.g. `thewitcher`).
def hashtag_columns(csv_reader):
header_row = next(csv_reader)
all_cols = []
search_string = "hashtag"
hashtag_cols = ['ownerUsername']
# Identify all the headers
for header in header_row:
all_cols.append(header)
# Figure out which headers start with `hashtag` and create a list
# Be sure to make the first list value 'ownerUsername'
for header in all_cols:
if search_string in header:
hashtag_cols.append(header)
return hashtag_cols
# Loop through every column that has the `hashtag` in its name
# and see if the campaign hashtag(s) were mentioned
def campaign_check(columns, userName, dataRow):
# This variable identifies the campaign hashtag.
# Current assumption is there will only be one campaign hashtag
# at a time.
#camp_hashtag = "selftapemaylotr"
camp_hashtag = "selftapemay2023"
# Loop through all of the items in this dictionary
# and check to see if the user leveraged the campaign hashtag
# If they did use it than the function should return a 'True' value. Otherwise
# the function should return a 'False' value.
for key, value in dataRow.items():
if value == camp_hashtag:
return True
return False
# Determine the encoding type of the csv file
encoding = determine_encoding(input_csv)
# Add the 'campaignFlag' to the source CSV file
add_campaign_col(input_csv, encoding)
# List of column names to keep
columns_to_keep = ['id','locationName','ownerFullName','ownerUsername','timestamp', 'type', 'videoDuration', 'campaignFlag', '_id', '_createdDate', '_updatedDate', '_owner']
# print("---BEGIN---")
# Read the input CSV file
with open(input_csv, 'r', encoding=encoding) as input_file:
reader = csv.DictReader(input_file)
#header = next(reader) # read the first row as header
hash_columns = hashtag_columns(reader)
# Write the output CSV file
with open(output_csv, 'w', newline='', encoding=encoding) as output_file:
columns_to_write = columns_to_keep
writer = csv.DictWriter(output_file, fieldnames=columns_to_write)
# Write the header row
writer.writeheader()
# ******************************************
# DEBUGGING THE READER DICTIONARY
# ******************************************
# Iterate over the dictionary and print each key-value pair
# for row in reader:
# for key, value in row.items():
# print(key, ':', value)
# print() # Add an extra line between rows
# ******************************************
# END DEBUGGING CODE
# ******************************************
# Write the data rows, keeping only the specified columns
for i, row in enumerate(reader):
#print("i: ")
#print(i)
#print("---row---")
#print(row)
#print("row userName: " + row['ownerUsername'])
# Read in the row of data to output
output_row = {key: row[key] for key in columns_to_keep}
# Call the campaign check function to see if users participated in the campaign
camp_check = campaign_check(hash_columns, output_row['ownerUsername'], {key: row[key] for key in hash_columns})
# Add the '@' symbol to the IG handle to match user profile data
# at selftapemay.com
output_row['id'] = str(output_row['id'])
output_row['ownerUsername'] = '@' + output_row['ownerUsername']
output_row['_id'] = output_row['id']
output_row['_owner'] = output_row['ownerUsername']
# Based on the results returned from campaign_check set the value accordingly
if camp_check:
output_row['campaignFlag'] = 'Y'
else:
output_row['campaignFlag'] = 'N'
# Convert date/time string to formatted date/time stamp
if output_row['timestamp'] != '':
# *****************************
# BEGIN TIMESTAMP CONVERSION
# *****************************
timestamp_str = output_row['timestamp']
# Parse the string into a datetime object
timestamp_dt = datetime.strptime(timestamp_str, '%Y-%m-%dT%H:%M:%S.%fZ')
print(timestamp_dt)
# Convert the datetime object to the desired format
bigquery_timestamp_str = timestamp_dt.strftime('%Y-%m-%d %H:%M:%S.%f UTC')
#print(bigquery_timestamp_str) # Output: 2023-02-07 17:13:20.000000 UTC
# *****************************
# END TIMESTAMP CONVERSION
# *****************************
#date_string = output_row['timestamp']
#dt = datetime.strptime(date_string, date_format)
output_row['timestamp'] = timestamp_dt
output_row['_createdDate'] = timestamp_dt
output_row['_updatedDate'] = timestamp_dt
# Write out data to CSV file
if start_dttm <= timestamp_dt <= end_dttm and output_row['type'] == 'Video':
writer.writerow(output_row)