Hi @Atrebas , great article comparing the two data processing tools. The concise syntax of Rdatatable is appealing - R is not my strong suit, so hopefully my time with #pydatatable will be fun. The code below are excerpts of some parts of your Pandas code I believe could be better written, and made more performant. For some scenarios, Python's verbose syntax is unavoidable. I put comments for each of them to refer to the particular part of the code. For some, I have no idea (rleid, %inrange%,...). When I have some time again, I will have a look at it, check some codes online (the rdatatable explanation wasnt too clear for me) and then I may revisit it. Either ways, thanks for your article. Was a good read. Same for the dplyr comparision as well. Cheers.
import pandas as pd
import numpy as np
df = pd.DataFrame(
{"V1" : [1, 2, 1, 2, 1, 2, 1, 2, 1],
"V2" : [1, 2, 3, 4, 5, 6, 7, 8, 9],
"V3" : [0.5, 1.0, 1.5] * 3,
"V4" : ['A', 'B', 'C'] * 3})
df
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
V3 |
V4 |
| 0 |
1 |
1 |
0.5 |
A |
| 1 |
2 |
2 |
1.0 |
B |
| 2 |
1 |
3 |
1.5 |
C |
| 3 |
2 |
4 |
0.5 |
A |
| 4 |
1 |
5 |
1.0 |
B |
| 5 |
2 |
6 |
1.5 |
C |
| 6 |
1 |
7 |
0.5 |
A |
| 7 |
2 |
8 |
1.0 |
B |
| 8 |
1 |
9 |
1.5 |
C |
#Discard rows using negative indices
df.loc[~df.index.isin(range(2,7))] #no need for the list constructor
#or df.query("not index.between(2,6)", engine='python')
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
V3 |
V4 |
| 0 |
1 |
1 |
0.5 |
A |
| 1 |
2 |
2 |
1.0 |
B |
| 7 |
2 |
8 |
1.0 |
B |
| 8 |
1 |
9 |
1.5 |
C |
#Filter rows using multiple conditions
df.loc[(df.V1==1) & (df.V4=="A")]
# or df.query("V1==1 and V4=='A' ")
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
V3 |
V4 |
| 0 |
1 |
1 |
0.5 |
A |
| 6 |
1 |
7 |
0.5 |
A |
#other filters
#tuples used when selecting multiple starts (("B","C",...))
df.loc[df.V4.str.startswith("B")]
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
V3 |
V4 |
| 1 |
2 |
2 |
1.0 |
B |
| 4 |
1 |
5 |
1.0 |
B |
| 7 |
2 |
8 |
1.0 |
B |
#DT[V2 %inrange% list(-1:1, 1:3)]
#dont understand this yet, will look into it
#Select one column using an index (not recommended)
df.iloc[:,[2]] #DF.iloc[:, 2].to_frame() unneccessary
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V3 |
| 0 |
0.5 |
| 1 |
1.0 |
| 2 |
1.5 |
| 3 |
0.5 |
| 4 |
1.0 |
| 5 |
1.5 |
| 6 |
0.5 |
| 7 |
1.0 |
| 8 |
1.5 |
cols = ['V2', 'V3']
df.loc[:, cols]
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V2 |
V3 |
| 0 |
1 |
0.5 |
| 1 |
2 |
1.0 |
| 2 |
3 |
1.5 |
| 3 |
4 |
0.5 |
| 4 |
5 |
1.0 |
| 5 |
6 |
1.5 |
| 6 |
7 |
0.5 |
| 7 |
8 |
1.0 |
| 8 |
9 |
1.5 |
#summarise one column
df.loc[:, ["V1"]].agg(['sum']) # unnecessary : DF[['V1']].sum().to_frame(name = 'sumV1')
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
#Summarise several columns
#Pandas version 1.1.0 supports renaming using namedtuples
#previously available only for aggregations after groupby
#not as pretty as rdatatable
#Pandas places the aggregations as indices
df.agg(sumV1=("V1",'sum'), sdv3=("V3","std"))
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V3 |
| sumV1 |
13.0 |
NaN |
| sdv3 |
NaN |
0.433013 |
##Summarise a subset of rows
df.loc[:3,"V1"].sum() #0 is redundant
df.loc[0,'V3'] #df.head(1).V3 not necessary and inefficient
df.at[df.index[-1], 'V3'] #df.tail(1).V3 not necessary and inefficient
#at method useful and a bit more performant than loc if interested in only scalars
#trying to keep up with changes from here
df.loc[:, "V1"] = df.loc[:, "V1"] ** 2
df = df.assign(v5 = np.log(df.V1))
df = df.assign(v6 = np.sqrt(df.V1), v7 = 'X')
#Create one column and remove the others
#no need to create another dataframe
df.loc[:, ["V3"]].add(1).rename(columns={"V3":"V8"})
# or df.loc[:, ["V3"]].add(1).set_axis(["V8"], axis = 1)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V8 |
| 0 |
1.5 |
| 1 |
2.0 |
| 2 |
2.5 |
| 3 |
1.5 |
| 4 |
2.0 |
| 5 |
2.5 |
| 6 |
1.5 |
| 7 |
2.0 |
| 8 |
2.5 |
df = df.drop(["v6", "v7"],1)
#Replace values for rows matching a condition
df.loc[df.loc[:, "V2"] < 4, "V2"] = 0
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
V4 |
| 0 |
1 |
0 |
A |
| 1 |
4 |
0 |
B |
| 2 |
1 |
0 |
C |
| 3 |
4 |
4 |
A |
| 4 |
1 |
5 |
B |
| 5 |
4 |
6 |
C |
| 6 |
1 |
7 |
A |
| 7 |
4 |
8 |
B |
| 8 |
1 |
9 |
C |
#by group
df.groupby(["V4"],as_index=False).agg(sumV2=("V2","sum")) #no need for the to frame and reset index
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V4 |
sumV2 |
| 0 |
A |
11 |
| 1 |
B |
13 |
| 2 |
C |
15 |
#several groups
df.groupby(["V4","V1"], as_index = False).agg(sumV2=("V2","sum")) # again, no need for the to frame and reset index
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V4 |
V1 |
sumV2 |
| 0 |
A |
1 |
7 |
| 1 |
A |
4 |
4 |
| 2 |
B |
1 |
5 |
| 3 |
B |
4 |
8 |
| 4 |
C |
1 |
9 |
| 5 |
C |
4 |
6 |
#Calling function in by
#you could apply the function before grouping
df.assign(V4 = df.V4.str.lower()).groupby(["V4"], as_index=False).agg(sumV1=("V1","sum"))
# or df.groupby(df.V4.str.lower()).agg(sumV1=("V1","sum")).reset_index()
# reset_index becomes inevitable here
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V4 |
sumV1 |
| 0 |
a |
6 |
| 1 |
b |
9 |
| 2 |
c |
6 |
#Assigning column name in by
#not possible in Pandas
#create the name before grouping
df.assign(abc = lambda x: x.V4.str.lower()).groupby(["abc"], as_index=False).agg(sumV1=("V1","sum"))
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
abc |
sumV1 |
| 0 |
a |
6 |
| 1 |
b |
9 |
| 2 |
c |
6 |
#Using a condition in by
#just trying to keep track here
df.groupby(df.V4=="A").V1.sum()
V4
False 15
True 6
Name: V1, dtype: int64
#on a subset of rows
df.iloc[:5].groupby("V4").agg(sumV1=("V1",'sum'))
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
#Add a column with number of observations for each group
df.assign(n = lambda x: x.groupby("V1").V4.transform("count"))
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
V4 |
n |
| 0 |
1 |
0 |
A |
5 |
| 1 |
4 |
0 |
B |
4 |
| 2 |
1 |
0 |
C |
5 |
| 3 |
4 |
4 |
A |
4 |
| 4 |
1 |
5 |
B |
5 |
| 5 |
4 |
6 |
C |
4 |
| 6 |
1 |
7 |
A |
5 |
| 7 |
4 |
8 |
B |
4 |
| 8 |
1 |
9 |
C |
5 |
#Summarise all the columns
#better to use built-in functions over apply where possible
#as they are optimised/vectorized
#apply is a for loop, handy in some situations, but relatively slow in a lot of cases
df.agg(["max"])
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
#Summarise several columns
df.loc[:, ["V1","V2"]].agg(["mean"]) #again, no need for apply
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
| mean |
2.333333 |
4.333333 |
# Summarise several columns by group
# unpacking a dictionary comprehension suffices here
df.groupby("V4").agg(**{f"{col}_mean":(col, "mean") for col in ["V1","V2"]})
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1_mean |
V2_mean |
| V4 |
|
|
| A |
2 |
3.666667 |
| B |
3 |
4.333333 |
| C |
2 |
5.000000 |
#Summarise using a condition
df.select_dtypes(include="number").agg(["mean"])
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
| mean |
2.333333 |
4.333333 |
#Modify all the columns
#reverse on the 0 axis
#apply not needed
df.iloc[::-1]
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
V4 |
| 8 |
1 |
9 |
C |
| 7 |
4 |
8 |
B |
| 6 |
1 |
7 |
A |
| 5 |
4 |
6 |
C |
| 4 |
1 |
5 |
B |
| 3 |
4 |
4 |
A |
| 2 |
1 |
0 |
C |
| 1 |
4 |
0 |
B |
| 0 |
1 |
0 |
A |
#Modify several columns (dropping the others)
df.filter(["V1","V2"]).agg(np.sqrt)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
| 0 |
1.0 |
0.000000 |
| 1 |
2.0 |
0.000000 |
| 2 |
1.0 |
0.000000 |
| 3 |
2.0 |
2.000000 |
| 4 |
1.0 |
2.236068 |
| 5 |
2.0 |
2.449490 |
| 6 |
1.0 |
2.645751 |
| 7 |
2.0 |
2.828427 |
| 8 |
1.0 |
3.000000 |
#Modify several columns (dropping the others)
df.filter(regex='[^V4]').agg(np.exp)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
| 0 |
2.718282 |
1.000000 |
| 1 |
54.598150 |
1.000000 |
| 2 |
2.718282 |
1.000000 |
| 3 |
54.598150 |
54.598150 |
| 4 |
2.718282 |
148.413159 |
| 5 |
54.598150 |
403.428793 |
| 6 |
2.718282 |
1096.633158 |
| 7 |
54.598150 |
2980.957987 |
| 8 |
2.718282 |
8103.083928 |
#Modify several columns (keeping the others)
df.loc[:, "V1":"V2"] = df.filter(["V1","V2"]).agg(np.sqrt)
df
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
V4 |
| 0 |
1.0 |
0.000000 |
A |
| 1 |
2.0 |
0.000000 |
B |
| 2 |
1.0 |
0.000000 |
C |
| 3 |
2.0 |
2.000000 |
A |
| 4 |
1.0 |
2.236068 |
B |
| 5 |
2.0 |
2.449490 |
C |
| 6 |
1.0 |
2.645751 |
A |
| 7 |
2.0 |
2.828427 |
B |
| 8 |
1.0 |
3.000000 |
C |
##Modify several columns (keeping the others)
cols = df.columns.difference(["V4"])
df.loc[:, cols] = df.filter(cols).agg(lambda x: pow(x,2))
df
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
V4 |
| 0 |
1.0 |
0.0 |
A |
| 1 |
4.0 |
0.0 |
B |
| 2 |
1.0 |
0.0 |
C |
| 3 |
4.0 |
4.0 |
A |
| 4 |
1.0 |
5.0 |
B |
| 5 |
4.0 |
6.0 |
C |
| 6 |
1.0 |
7.0 |
A |
| 7 |
4.0 |
8.0 |
B |
| 8 |
1.0 |
9.0 |
C |
#Modify columns using a condition (dropping the others)
#no need for intermediate cols variable
df.select_dtypes("number").sub(1)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
| 0 |
0.0 |
-1.0 |
| 1 |
3.0 |
-1.0 |
| 2 |
0.0 |
-1.0 |
| 3 |
3.0 |
3.0 |
| 4 |
0.0 |
4.0 |
| 5 |
3.0 |
5.0 |
| 6 |
0.0 |
6.0 |
| 7 |
3.0 |
7.0 |
| 8 |
0.0 |
8.0 |
#Modify columns using a condition (keeping the others)
df.loc[:, cols] = df.loc[:, cols].astype(int)
df
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
V4 |
| 0 |
1 |
0 |
A |
| 1 |
4 |
0 |
B |
| 2 |
1 |
0 |
C |
| 3 |
4 |
4 |
A |
| 4 |
1 |
5 |
B |
| 5 |
4 |
5 |
C |
| 6 |
1 |
7 |
A |
| 7 |
4 |
8 |
B |
| 8 |
1 |
9 |
C |
df.groupby("V4").head(2).assign(V2="X").sort_values("V4")
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
V4 |
| 0 |
1 |
X |
A |
| 3 |
4 |
X |
A |
| 1 |
4 |
X |
B |
| 4 |
1 |
X |
B |
| 2 |
1 |
X |
C |
| 5 |
4 |
X |
C |
#not sure what this does
#will study it some more
#Use multiple expressions (with DT[,{j}])
#Expression chaining using DT[][] (recommended)
df.groupby(['V4'], as_index=False).agg(V1sum=("V1",np.sum)).query("V1sum > 5")
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V4 |
V1sum |
| 0 |
A |
6 |
| 1 |
B |
9 |
| 2 |
C |
6 |
df.set_index('V4', drop = False, inplace = True)
df.sort_index(inplace = True)
df
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
V4 |
| V4 |
|
|
|
| A |
1 |
0 |
A |
| A |
4 |
4 |
A |
| A |
1 |
7 |
A |
| B |
4 |
0 |
B |
| B |
1 |
5 |
B |
| B |
4 |
8 |
B |
| C |
1 |
0 |
C |
| C |
4 |
5 |
C |
| C |
1 |
9 |
C |
#Apply a function on the matching rows
df.loc[['A', 'C'],'V1'].sum() #faster than df.loc[['A', 'C']].V1.sum() as the data fetching is done once
#keeping pace with the material
#Modify values for matching rows
df.loc['A', 'V1'] = 0
#Use keys in by
df.query("index != 'B'").groupby(level=0).agg({"V1":np.sum})
# or df.loc[~(df.index=="B")].groupby(level=0).agg({"V1":"sum"})
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
#Set keys/indices for multiple columns
df.set_index(['V4', 'V1'], drop = False, inplace = True)
df.sort_index(inplace = True)
df
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
|
V1 |
V2 |
V4 |
| V4 |
V1 |
|
|
|
| A |
0 |
0 |
0 |
A |
| 0 |
0 |
4 |
A |
| 0 |
0 |
7 |
A |
| B |
1 |
1 |
5 |
B |
| 4 |
4 |
0 |
B |
| 4 |
4 |
8 |
B |
| C |
1 |
1 |
0 |
C |
| 1 |
1 |
9 |
C |
| 4 |
4 |
5 |
C |
#Subset using multiple keys/indices
df.loc[("C",1)]
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
|
V1 |
V2 |
V4 |
| V4 |
V1 |
|
|
|
| C |
1 |
1 |
0 |
C |
| 1 |
1 |
9 |
C |
#Subset using multiple keys/indices
df.loc[(("B","C"),1),:]
#or query
#can be much clearer
#df.query("V4 in ('B','C') and V1==1")
# you can rename the indices
# to separate it from the columns
# the columns will take precedence if index and columns have the same name
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
|
V1 |
V2 |
V4 |
| V4 |
V1 |
|
|
|
| B |
1 |
1 |
5 |
B |
| C |
1 |
1 |
0 |
C |
| 1 |
1 |
9 |
C |
df.query("V4 in ('B','C') and V1==1")
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
|
V1 |
V2 |
V4 |
| V4 |
V1 |
|
|
|
| B |
1 |
1 |
5 |
B |
| C |
1 |
1 |
0 |
C |
| 1 |
1 |
9 |
C |
#remove keys/indices
df.reset_index(inplace = True, drop = True)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
V4 |
| 0 |
0 |
0 |
A |
| 1 |
0 |
4 |
A |
| 2 |
0 |
7 |
A |
| 3 |
1 |
5 |
B |
| 4 |
4 |
0 |
B |
| 5 |
4 |
8 |
B |
| 6 |
1 |
0 |
C |
| 7 |
1 |
9 |
C |
| 8 |
4 |
5 |
C |
#trying to keep pace
df.iloc[0, 1] = 3
df
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
V4 |
| 0 |
0 |
3 |
A |
| 1 |
0 |
4 |
A |
| 2 |
0 |
7 |
A |
| 3 |
1 |
5 |
B |
| 4 |
4 |
0 |
B |
| 5 |
4 |
8 |
B |
| 6 |
1 |
0 |
C |
| 7 |
1 |
9 |
C |
| 8 |
4 |
5 |
C |
df.sort_values(['V4','V1'], ascending = [True, False], inplace = True)
df.rename(columns = {'V2':'v2'}, inplace = True)
cols = df.columns.values; cols[1] = 'V2'
df.columns = cols
df = df[['V4', 'V1', 'V2']]
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V4 |
V1 |
V2 |
| 0 |
A |
0 |
3 |
| 1 |
A |
0 |
4 |
| 2 |
A |
0 |
7 |
| 4 |
B |
4 |
0 |
| 5 |
B |
4 |
8 |
| 3 |
B |
1 |
5 |
| 8 |
C |
4 |
5 |
| 6 |
C |
1 |
0 |
| 7 |
C |
1 |
9 |
#Get row number of first (and last) observation by group
pd.DataFrame(df.groupby("V4").indices).melt() #returns all the indices
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
variable |
value |
| 0 |
A |
0 |
| 1 |
A |
1 |
| 2 |
A |
2 |
| 3 |
B |
3 |
| 4 |
B |
4 |
| 5 |
B |
5 |
| 6 |
C |
6 |
| 7 |
C |
7 |
| 8 |
C |
8 |
pd.DataFrame(df.groupby("V4").indices).loc[1] #get the second row per group
A 1
B 4
C 7
Name: 1, dtype: int64
pd.DataFrame(df.groupby("V4").indices).iloc[[0,-1]].melt() #get first and last indices per group
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
variable |
value |
| 0 |
A |
0 |
| 1 |
A |
2 |
| 2 |
B |
3 |
| 3 |
B |
5 |
| 4 |
C |
6 |
| 5 |
C |
8 |
df.to_csv("test.csv", index = False)
#drop columns when reading csv
pd.read_csv("test.csv", usecols = lambda x: x != "V4")
# or pd.read_csv("test.csv", usecols = lambda x: x not in ["V4"])
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V1 |
V2 |
| 0 |
0 |
3 |
| 1 |
0 |
4 |
| 2 |
0 |
7 |
| 3 |
4 |
0 |
| 4 |
4 |
8 |
| 5 |
1 |
5 |
| 6 |
4 |
5 |
| 7 |
1 |
0 |
| 8 |
1 |
9 |
mdf = df.melt(id_vars = "V4", value_vars=("V1","V2"))
mdf
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
|
V4 |
variable |
value |
| 0 |
A |
V1 |
0 |
| 1 |
A |
V1 |
0 |
| 2 |
A |
V1 |
0 |
| 3 |
B |
V1 |
4 |
| 4 |
B |
V1 |
4 |
| 5 |
B |
V1 |
1 |
| 6 |
C |
V1 |
4 |
| 7 |
C |
V1 |
1 |
| 8 |
C |
V1 |
1 |
| 9 |
A |
V2 |
3 |
| 10 |
A |
V2 |
4 |
| 11 |
A |
V2 |
7 |
| 12 |
B |
V2 |
0 |
| 13 |
B |
V2 |
8 |
| 14 |
B |
V2 |
5 |
| 15 |
C |
V2 |
5 |
| 16 |
C |
V2 |
0 |
| 17 |
C |
V2 |
9 |
#cast data from long to wide
pd.crosstab(mdf.V4, mdf.variable)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| variable |
V1 |
V2 |
| V4 |
|
|
| A |
3 |
3 |
| B |
3 |
3 |
| C |
3 |
3 |
#cast data from long to wide
pd.crosstab(mdf.V4, mdf.variable, values = mdf.value, aggfunc='sum')
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| variable |
V1 |
V2 |
| V4 |
|
|
| A |
0 |
14 |
| B |
9 |
13 |
| C |
6 |
14 |
#cast data from long to wide
pd.crosstab(mdf.V4, mdf.value > 5)
<style scoped>
.dataframe tbody tr th:only-of-type {
vertical-align: middle;
}
.dataframe tbody tr th {
vertical-align: top;
}
.dataframe thead th {
text-align: right;
}
</style>
| value |
False |
True |
| V4 |
|
|
| A |
5 |
1 |
| B |
5 |
1 |
| C |
5 |
1 |