Hive和Spark常用代码片段

Hive

  • 日期类
功能 示例代码 备注
日期有无横杠转换 date_format(from_unixtime(unix_timestamp('20240101', 'yyyyMMdd')), 'yyyy-MM-dd') 输出 "2024-01-01"
日期加减 date_format(date_add(from_unixtime(unix_timestamp('20240101', 'yyyyMMdd')), 7), 'yyyyMMdd') 输出 "20240108"
日期差 datediff(from_unixtime(unix_timestamp('20240105', 'yyyyMMdd')), from_unixtime(unix_timestamp('20240101', 'yyyyMMdd'))) 输出 4

Spark

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
import time, datetime
from spark_util import spark_start

# Create the Spark session only once, so re-running this cell in a notebook
# reuses the existing module-level ``spark`` object instead of starting another.
if 'spark' not in globals():
    spark = spark_start('data_explore_zt', executor_core=2)
    # NOTE(review): the original indentation was lost; this call is assumed to
    # belong to the guarded block -- confirm against the original notebook.
    spark.sparkContext.setLogLevel("ERROR")

def show(sdf, n=None, truncate=True):
    """Render the first rows of a Spark DataFrame as a pandas table.

    Args:
        sdf: Object exposing ``columns`` and ``take(n)`` whose rows provide
            ``asDict()`` (presumably a ``pyspark.sql.DataFrame`` -- confirm).
        n: Number of rows to fetch. When falsy (``None`` or ``0``), 5 rows are
            fetched if there are more than 10 columns, otherwise 10.
        truncate: When false, the pandas column width limit is raised to 999
            so long cell values are shown in full.

    Returns:
        Whatever the ambient ``display`` callable returns.

    Note:
        The pandas display options are changed globally and are not restored
        after the call.
    """
    import pandas as pd

    pd.options.display.max_columns = 100
    if not truncate:
        pd.options.display.max_colwidth = 999

    if not n:
        # Wide frames get fewer rows by default.
        n = 5 if len(sdf.columns) > 10 else 10

    rows = [row.asDict() for row in sdf.take(n)]
    # ``display`` is not defined in this file; it is expected to be supplied
    # by the notebook environment.
    return display(pd.DataFrame(rows))

def get_date(n=-1, base_date=None, base_date_FMT=None, FMT='%Y%m%d'):
    """Return the date ``n`` days away from a base date, formatted with ``FMT``.

    Args:
        n: Day offset added to the base date; negative values go back in
            time. The default ``-1`` yields the day before the base date.
        base_date: Base date string. When ``None``, the current time at a
            fixed UTC+8 offset is used.
        base_date_FMT: ``strptime`` format of ``base_date``. Falls back to
            ``FMT`` when falsy.
        FMT: ``strftime`` format of the returned string.

    Returns:
        The shifted date as a string.

    Raises:
        ValueError: If ``base_date`` does not match the parsing format.
    """
    if base_date is None:
        # A fixed UTC+8 offset matches present-day Asia/Shanghai wall-clock
        # time without the optional pytz dependency. The previous fallback to
        # naive server-local time could silently return the wrong day on
        # hosts running in another timezone.
        utc_plus_8 = datetime.timezone(datetime.timedelta(hours=8))
        base_datetime = datetime.datetime.now(utc_plus_8)
    else:
        base_datetime = datetime.datetime.strptime(base_date, base_date_FMT or FMT)
    return (base_datetime + datetime.timedelta(days=n)).strftime(FMT)

def date_range(start, end, end_include=False, step=1, FMT="%Y%m%d"):
    """List the dates from ``start`` towards ``end``, ``step`` days apart.

    Args:
        start: First date of the range, formatted as ``FMT``.
        end: Boundary date, formatted as ``FMT``. Excluded unless
            ``end_include`` is true.
        end_include: When true, ``end`` itself may appear in the result
            (if the stepping lands on it).
        step: Day increment between consecutive dates. Use a negative value
            to walk backwards from a later ``start`` to an earlier ``end``.
        FMT: ``strptime``/``strftime`` format used for both input and output.

    Returns:
        A list of date strings. Empty when ``step`` points away from ``end``.

    Raises:
        ValueError: If ``step`` is zero or a date does not match ``FMT``.
    """
    if step == 0:
        # Previously this surfaced as ZeroDivisionError or a bare range()
        # error depending on end_include; fail uniformly instead.
        raise ValueError("step must not be zero")

    # Parse the start once rather than on every iteration.
    first_day = datetime.datetime.strptime(start, FMT)
    span = (datetime.datetime.strptime(end, FMT) - first_day).days
    if end_include:
        # Push the exclusive range() bound one day past ``end`` in the
        # direction of travel.
        span += 1 if step > 0 else -1

    return [
        (first_day + datetime.timedelta(days=offset)).strftime(FMT)
        for offset in range(0, span, step)
    ]


SparkSQL

配置

1
2
3
4
5
6
7
8
9
10
11
12
{

"spark.executor.instances": "4",

"spark.executor.memory": "3g",

"spark.executor.cores": "2",

"spark.hadoop.mapreduce.fileoutputcommitter.algorithm.version": "2"

}