
Commit d5c4a69

First release before packaging for pip
1 parent 2f7c05d commit d5c4a69

File tree

3 files changed: 100 additions & 0 deletions


.gitignore

Lines changed: 2 additions & 0 deletions
@@ -3,6 +3,8 @@ __pycache__/
 *.py[cod]
 *$py.class

+*.sw*
+
 # C extensions
 *.so

pyspark_utils/__init__.py

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
from pyspark_utils import p, setupSpark, getSpark, describeWithNulls, toPandas, sparkSql, fromCsvSpark, cols

pyspark_utils/pyspark_utils.py

Lines changed: 96 additions & 0 deletions
@@ -0,0 +1,96 @@
from pyspark.sql import *
from pyspark import SparkConf
from pyspark.sql import Row, Column, DataFrame as SDataFrame
from pyspark.sql.types import *
import pandas as pd
import os, sys
from collections import namedtuple as ntup


def p(msg, o=None): omsg = ": %s" % repr(o) if o is not None else ""; print("%s%s\n" % (msg, omsg))


def setupSpark():
    os.environ["PYSPARK_SUBMIT_ARGS"] = "pyspark-shell"
    pd.set_option('display.max_rows', 20)
    pd.set_option('display.max_colwidth', -1)
    pd.set_option('display.max_columns', None)
    pd.set_option('expand_frame_repr', False)
    spark = SparkSession.builder.appName("pyspark_utils").master("local").getOrCreate()
    return spark

# Get the SparkSession, starting a local one if none exists yet
#
def getSpark():
    spark = globals()['spark'] if 'spark' in globals() else setupSpark()
    return spark

# Describe a pandas DataFrame, prepending a per-column null count to the usual stats
def describeWithNulls(df, doPrint=True):
    # df = pd.DataFrame({ 'a': [1,2,3], 'b': ['a','b','c'], 'c': [99.5,11.2, 433.1], 'd':[123,'abc',None]})
    desc = df.describe()  # Returns a DataFrame with stats in the row index
    combo = pd.concat([df.isna().sum(), desc.T], axis=1).set_axis(['Nulls'] + list(desc.index), axis=1, inplace=False)
    if doPrint:
        p(combo.head(100))
    return combo

# Read a Parquet file into a Spark DataFrame, register it as a temp view, and also create a pandas DataFrame
#
def toPandas(path, tname, sql=None, count=False):
    df = getSpark().read.parquet(path)
    if count: p(tname + ": count=" + str(df.count()))
    df.createOrReplaceTempView(tname)
    if sql is not None:
        df = getSpark().sql(sql)
        df.createOrReplaceTempView(tname)
    pdf = df.toPandas()
    describeWithNulls(pdf)
    return df, pdf

# Run a Spark SQL query and return the Spark and pandas DataFrames
#
def sparkSql(sql, tname=None, count=True, describe=False):
    sdf = getSpark().sql(sql)
    if count and tname: p(tname + ": count=" + str(sdf.count()))
    if tname: sdf.createOrReplaceTempView(tname)
    pdf = sdf.toPandas()
    with pd.option_context('display.max_rows', None, 'display.max_columns', None):  # more options can be specified also
        print(pdf)
    if describe:
        describeWithNulls(pdf)
    return sdf, pdf

# Read a CSV, register it as a temp view, and create Spark and pandas DataFrames
#
def fromCsvSpark(path, tname, header=True, count=True, sql=None):
    p('Reading from %s..' % path)
    df = getSpark().read.csv(path, header=header)
    if count: p(tname + ": count=" + str(df.count()))
    df.createOrReplaceTempView(tname)
    if sql is not None:
        df = getSpark().sql(sql)
        df.createOrReplaceTempView(tname)
    pdf = df.toPandas()
    describeWithNulls(pdf)
    return df, pdf

# Return selected columns of a pandas DataFrame
#
def cols(df, cnames):
    cnames = cnames if isinstance(cnames, list) else [cnames]
    return df.loc[:, cnames]

# Comma-delimited tokenizer, intended for use as a Spark UDF
#
def tokenize(txt):
    def stripx(x): return x.strip()
    if txt.find(',') < 0: return txt
    else:
        toks = list(map(stripx, txt.split(',')))
        # return [toks[0], toks]
        return toks

if __name__ == '__main__':
    # Example of registering tokenize as a Spark UDF, kept here as a reference string
    sampleCode = """
from pyspark.sql.types import *
from pyspark.sql.functions import udf
tokenize_udf = udf(tokenize, ArrayType(StructType([StructField("tok", StringType(), False)])))
spark.udf.register("tokenize", tokenize_udf)
"""
