[SQL]表基础探查脚本

PgSQL,表基础探查脚本

表情况

表字段信息

SELECT attname FROM pg_attribute
WHERE attrelid = 'spfjbasepop.b01_gjjgrzhxx'::regclass  -- 表名
AND attnum > 0 LIMIT 1;

表数据量

  • 计算表中的数据量和去重数据量
-- 表数据量 
SELECT COUNT(1) 
FROM "spfjbasepop".b01_gjjgrzhxx ;
-- 表去重数据量 
SELECT DISTINCT COUNT(1) 
FROM "spfjbasepop".b01_gjjgrzhxx ;

数据时间访问统计

  • 计算表中数据的时间范围
-- 数据最晚交易日期:
SELECT MAX(zxjysj) 
FROM"spfjbasepop".b01_gjjgrzhxx

-- 数据最早交易日期:
SELECT MIN(zxjysj) FROM (
	SELECT * FROM "spfjbasepop".b01_gjjgrzhxx 
	WHERE 
	zxjysj > '1900-01-01 00:00:00.0'
);

字段情况

字段基础信息

  • 查询最小值/最大值/各分点/空值率/去重条数
-- 查询最小值/最大值/各分点/空值率/去重条数:
SELECT  
	'(dwb_source_table) ', -- 字段名
	MIN(dwb_source_table) AS min_value,-- 最小值
 	PERCENTILE_DISC(0.25) WITHIN GROUP (ORDER BY dwb_source_table) AS value1_4,  -- 25%
	PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY dwb_source_table) AS value1_2, -- 50%
	PERCENTILE_DISC(0.75) WITHIN GROUP (ORDER BY dwb_source_table) AS value3_4, -- 75%
	MAX(dwb_source_table) AS max_value,  -- 最大值
	(SUM(CASE WHEN (TRIM(dwb_source_table::VARCHAR) ='' OR dwb_source_table IS NULL ) THEN 1 ELSE 0 END)/COUNT(*)::numeric) AS null_percent, -- 空值率
	COUNT(DISTINCT dwb_source_table) AS distincit_num, -- 去重记录数
	AVG(CASE WHEN pg_typeof(dwb_source_table)::VARCHAR IN ('numeric','integer','float') THEN dwb_source_table ELSE NULL END) AS average_value  -- 平均值
FROM 
	"spfjbasepop".b01_gjjgrzhxx; 

自动化探查脚本

  • 可通过EXCEL表自动生成探查脚本:
A B C D E F G H I
1 库中文名 库英文名 表中文名 表英文名 字段中文名 字段英文名 字段类型 单字段探查脚本 全表探查脚本
2 人口库基础层 spfjbasepop 公积金个人账户信息 b01_gjjgrzhxx 个人账户编号 grzhbh string =CONCAT("SELECT '",F2,"',","MIN(",F2,")::VARCHAR AS min_value",",PERCENTILE_DISC(0.25) WITHIN GROUP (ORDER BY ",F2,")::VARCHAR AS value1_4",",PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY ",F2," )::VARCHAR AS value1_2",",PERCENTILE_DISC(0.75) WITHIN GROUP (ORDER BY ",F2," )::VARCHAR AS value3_4",",MAX(",F2,")::VARCHAR AS max_value",",AVG(CASE WHEN pg_typeof(",F2,")::VARCHAR IN ('numeric','integer','float') THEN ",F2,"::VARCHAR ELSE NULL END)::NUMBER AS average_value",",(SUM(CASE WHEN (TRIM(",F2,"::VARCHAR) ='' OR ",F2," IS NULL) THEN 1 ELSE 0 END)/COUNT(*)::numeric) AS null_percent",",COUNT(DISTINCT ",F2,")AS distincit_num"," from ",B2,".",D2) =TEXTJOIN(" UNION ALL ",TRUE,H:H)
  • 自动脚本
-- 单字段探查脚本
=CONCAT("SELECT '",F2,"',","MIN(",F2,")::VARCHAR AS min_value",",PERCENTILE_DISC(0.25) WITHIN GROUP (ORDER BY ",F2,")::VARCHAR AS value1_4",",PERCENTILE_DISC(0.5) WITHIN GROUP (ORDER BY ",F2," )::VARCHAR AS value1_2",",PERCENTILE_DISC(0.75) WITHIN GROUP (ORDER BY ",F2," )::VARCHAR AS value3_4",",MAX(",F2,")::VARCHAR AS max_value",",AVG(CASE WHEN pg_typeof(",F2,")::VARCHAR IN ('numeric','integer','float') THEN ",F2,"::VARCHAR ELSE NULL END)::NUMBER AS average_value",",(SUM(CASE WHEN (TRIM(",F2,"::VARCHAR) ='' OR ",F2," IS NULL) THEN 1 ELSE 0 END)/COUNT(*)::numeric) AS null_percent",",COUNT(DISTINCT ",F2,")AS distincit_num"," from ",B2,".",D2)
 -- 全表探查脚本
 =TEXTJOIN(" UNION ALL ",TRUE,H:H) 

维度字段频次统计

  • 统计维度字段在全表中出现的频次情况
SELECT
	grzhztdm,
	COUNT(1) AS frequence
FROM
	"spfjbasepop".b01_gjjgrzhxx
GROUP BY
	grzhztdm
ORDER BY 
	grzhztdm;

时间字段按年按月统计

  • 按年统计
-- 时间维度按年的频次统计:
SELECT
  time2year,
  COUNT(1)
FROM
  (
    SELECT
      to_char (khrq, 'yyyy') AS time2year -- 时间转为年份
    FROM
      "spfjbasepop".b01_gjjgrzhxx
  )
GROUP BY
  time2year
ORDER BY
  time2year;
  • 按月统计
-- 时间维度按月的频次统计:  
SELECT
  time2month,
  COUNT(1)
FROM
  (
    SELECT
      to_char (khrq, 'yyyy-mm') AS time2month -- 时间转为月份
    FROM
      "spfjbasepop".b01_gjjgrzhxx
  )
GROUP BY
  time2month
ORDER BY
  time2month;

度量字段分区间分段统计

-- 度量值分区间分段统计:  
SELECT 
  range_min, 
  range_max,
  COUNT(*) 
FROM (
  SELECT
	 min_value + (max_value - min_value) *seq/part AS range_min, -- 最小区间
	 min_value + (max_value - min_value) *(seq+1)/part AS range_max -- 最大区间
  FROM 
  	LATERAL (SELECT 0 AS min_value) AS min_value, -- **分段的最小值**
	LATERAL (SELECT 10000 AS max_value) AS max_value, -- **分段的最大值**
	LATERAL (SELECT 500 AS part) AS part, -- 分段数
	LATERAL(SELECT generate_series(0,part-1) AS seq)AS seq  -- 递增序列
)AS ranges 
JOIN "spfjbasepop".b01_gjjgrzhxx   -- **表   名** 
ON   gryjce >= ranges.range_min   -- **字段名**
AND gryjce <   ranges.range_max   -- **字段名** 
GROUP BY range_min, range_max 
ORDER BY range_min;

自动化探查脚本

  • 可通过EXCEL表自动生成探查脚本:
A B C D E F G H I J K
1 库中文名 库英文名 表中文名 表英文名 字段中文名 字段英文名 字段类型 单字段探查脚本 全表探查脚本 单字段维度/度量探查脚本 全表维度/度量探查脚本
1 库中文名 库英文名 表中文名 表英文名 字段中文名 字段英文名 字段类型 单字段探查脚本 全表探查脚本
2 人口库基础层 spfjbasepop 公积金个人账户信息 b01_gjjgrzhxx 个人账户编号 grzhbh string 见字段情况章节 见字段情况章节 见下方自动脚本 见下方自动脚本
  • 自动脚本
-- 单字段维度/度量探查脚本
= @IFS(OR(G2="string"),CONCAT("SELECT ",F2," AS ",D2,"_",F2,",COUNT(1) AS ",D2,"_",F2,"_frequence FROM ",B2,".",D2," GROUP BY ",F2," ORDER BY ",F2,";
","SELECT frequence AS ",D2,"_",F2,"_frequence,COUNT(1) AS ",D2,"_",F2,"_frequence_frequence from (","SELECT ",F2,",COUNT(1) AS frequence FROM ",B2,".",D2," GROUP BY ",F2," ORDER BY ",F2,")GROUP BY frequence;"),OR(G2="date"),CONCAT("SELECT time2year AS ",D2,"_",F2,"2year, COUNT(1) AS ",D2,"_",F2,"2year_frequence FROM (SELECT to_char(",F2,",'yyyy') AS time2year FROM ",B2,".",D2,") GROUP BY time2year ORDER BY time2year;","SELECT time2month AS ",D2,"_",F2,"2month, COUNT(1) AS ",D2,"_",F2,"2month_frequence FROM (SELECT to_char(",F2,",'yyyy-mm') AS time2month FROM ",B2,".",D2,") GROUP BY time2month ORDER BY time2month;"),OR(G2="number"),CONCAT("SELECT range_min AS ",D2,"_",F2,"_range_min, range_max AS ",D2,"_",F2,"_range_max,COUNT(*) AS ",D2,"_",F2,"_frequence FROM(SELECT min_value+(max_value-min_value)*seq/part AS range_min,min_value+(max_value-min_value)*(seq+1)/part AS range_max FROM LATERAL (SELECT min(",F2,") AS min_value FROM ",B2,".",D2,") AS min_value,LATERAL(SELECT max(",F2,") AS max_value FROM ",B2,".",D2,") AS max_value,LATERAL (SELECT 500 AS part) AS part,LATERAL(SELECT generate_series(0,part-1) AS seq)AS seq ) AS ranges JOIN ",B2,".",D2," ON ",F2,">=ranges.range_min AND ",F2,"< ranges.range_max GROUP BY range_min,range_max ORDER BY range_min;
","SELECT range_min AS ",D2,"_",F2,"_range_min, range_max AS ",D2,"_",F2,"_range_max,COUNT(*) AS ",D2,"_",F2,"_frequence FROM(SELECT min_value+(max_value-min_value)*seq/part AS range_min,min_value+(max_value-min_value)*(seq+1)/part AS range_max FROM LATERAL (SELECT PERCENTILE_DISC(0.20) WITHIN GROUP (ORDER BY ",F2,") AS min_value FROM ",B2,".",D2,") AS min_value,LATERAL(SELECT PERCENTILE_DISC(0.80) WITHIN GROUP (ORDER BY ",F2,") AS max_value FROM ",B2,".",D2,") AS max_value,LATERAL (SELECT 500 AS part) AS part,LATERAL(SELECT generate_series(0,part-1) AS seq)AS seq ) AS ranges JOIN ",B2,".",D2," ON ",F2,">=ranges.range_min AND ",F2,"< ranges.range_max GROUP BY range_min,range_max ORDER BY range_min;"))
--  全表维度/度量探查脚本
= =TEXTJOIN("
",TRUE,R:R)