# pandas

官方文档: https://pandas.pydata.org/docs/ (opens new window)

import pandas as pd
import numpy as pd

# 读取数据

csv: https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html#pandas.read_csv (opens new window)

excel: https://pandas.pydata.org/docs/reference/api/pandas.read_excel.html#pandas.read_excel (opens new window)

"""读取csv文件"""
df = pd.read_csv(filename, sep=",", header=0, index_col=None, encoding="utf-8")
"""读取excel文件"""
df = pd.read_excel(filename, sheet_name=0, header=0, index_col=None, encoding="utf-8")
"""迭代读取tsv文件 (大文件/有错误行)"""
for chunk in pd.read_csv(filename, header=0, sep="\t", encoding="utf-8", error_bad_lines=False, iterator=True, chunksize=chunk_size):
    pass

# 存储数据

"""存储为csv文件"""
df.to_csv(filename, sep=',', index=True, columns=None, header=True)
"""存储为excel文件"""
DataFrame.to_excel(filename, index=True, sheet_name='Sheet1', columns=None, header=True)

# 数据转换

"""df 2 json"""
df.to_json(orient="records")
"""df 2 list"""
df.values.tolist()
"""series 2 dict"""
series.to_dict()
"""series 2 list"""
series.tolist()
"""json_list/dict_list 2 df"""
pd.DataFrame(list)

# DataFrame 操作

# 基本操作 loc iter apply copy nan

"""按照index定位"""
df.loc[index]
df.loc[index, col]
"""按照行定位"""
df.iloc[row]
df.iloc[row, col]
"""按行进行迭代"""
for index, row in df.iterrows():
    pass
"""系列转换"""
def func(x, a):
    pass
series.apply(func, args=(a,))
def func(x, **kwargs):
    x += kwargs["a"]
    pass
series.apply(func, a=1, b=2, c=3)
"""深拷贝"""
df_copy = df.copy(deep=True)
"""空值处理"""
df.isnull().sum() # 统计空值数目
df.fillna(0)      # 填充空值为0
"""统计数目"""
series.value_counts()

# 筛选 filter

"""筛选符合某条件的数据"""
df[df[col]==condition]
"""筛选不符合某条件的数据"""
df[~df[col]==condition]
"""筛选在列表中的数据"""
df[df[col].isin(mylist)]
"""筛选符合多重条件 &-且 |-或"""
df[(df[col]==condition1) & (df[col]==condition2)]
"""筛选包含字符串"""
df[df[col].str.findall(pattern).astype("bool")]

# 排序 sort

df.sort_values(by=[c1, c2], ascending=[True, False], na_position="last")

# 拼接 merge concat

Blog: https://www.cnblogs.com/keye/p/10791705.html (opens new window)

"""基于共同列拼接
on (str): 用来合并的列名
how (str): 数据合并的方式 left-基于左df列的数据合并 right-基于右df列的数据合并 outer-取并集 inner-取交集
suffixes (str): 若有相同列且该列没有作为合并的列,设置相应的后缀名
"""
pd.merge(df1, df2, on=None, how="inner", left_on=None, right_on=None, suffixes=('_x', '_y'))

"""行/列拼接
axis (int): 0-按行拼接 1-按列拼接
join (str): 拼接方式 outer-取并集 inner-取交集
ignore_index (bool): 是否丢弃原索引
"""
pd.concat((df1, df2), axis=0, join='outer', ignore_index=False)

# 分组 group_by

col_group = df.groupby([col1])
temp = col_group[col2].unique() # 统计各个类别col1中col2中不同类型的数目

df.groupby([col1])[col2].value_counts().unstack() # 统计各个类别col1中col2中不同类型的数目