`
k_lb
  • 浏览: 802251 次
  • 性别: Icon_minigender_1
  • 来自: 郑州
社区版块
存档分类
最新评论
  • kitleer: 据我所知,国内有款ETL调度监控工具TaskCTL,支持ket ...
    kettle调度

HIVE Queries 查询

阅读更多

SELECT … FROM Clauses


hive> SELECT name, salary FROM employees;


表别名

hive> SELECT   name,   salary FROM employees;
hive> SELECT e.name, e.salary FROM employees e;


Specify Columns with Regular Expressions


hive> SELECT symbol, `price.*` FROM stocks;
AAPL    195.69  197.88  194.0   194.12  194.12
AAPL    192.63  196.0   190.85  195.46  195.46
AAPL    196.73  198.37  191.57  192.05  192.05
AAPL    195.17  200.2   194.42  199.23  199.23
AAPL    195.91  196.32  193.38  195.86  195.86

Computing with Column Values

hive> SELECT upper(name), salary, deductions["Federal Taxes"],
    > round(salary * (1 - deductions["Federal Taxes"])) FROM employees;

JOHN DOE 100000.0 0.2 80000
MARY SMITH 80000.0 0.2 64000
TODD JONES 70000.0 0.15 59500
BILL KING 60000.0 0.15 51000

Arithmetic Operators --算术运算



Mathematical functions






Aggregate functions -- 聚合函数





hive> SET hive.map.aggr=true;
hive> SELECT count(*), avg(salary) FROM employees;

Table generating functions


hive> SELECT explode(subordinates) AS sub FROM employees;
Mary Smith
Todd Jones
Bill King



Other built-in functions




LIMIT Clause --限制行数


hive> SELECT upper(name), salary, deductions["Federal Taxes"],
    > round(salary * (1 - deductions["Federal Taxes"])) FROM employees
    > LIMIT 2;
JOHN DOE    100000.0  0.2   80000
MARY SMITH   80000.0  0.2   64000

Column Aliases --列别名


hive> SELECT upper(name), salary, deductions["Federal Taxes"] as fed_taxes,
    > round(salary * (1 - deductions["Federal Taxes"])) as salary_minus_fed_taxes
    > FROM employees LIMIT 2;
JOHN DOE    100000.0  0.2   80000
MARY SMITH   80000.0  0.2   64000


Nested SELECT Statements --子查询


hive> FROM (
    >   SELECT upper(name) AS name, salary, deductions["Federal Taxes"] as fed_taxes,
    >   round(salary * (1 - deductions["Federal Taxes"])) as salary_minus_fed_taxes
    >   FROM employees
    > ) e
    > SELECT e.name, e.salary_minus_fed_taxes
    > WHERE e.salary_minus_fed_taxes > 70000;
JOHN DOE    100000.0  0.2   80000

CASE … WHEN … THEN Statements --case 关键字


hive> SELECT name, salary,
    >   CASE
    >     WHEN salary <  50000.0 THEN 'low'
    >     WHEN salary >= 50000.0 AND salary < 70000.0 THEN 'middle'
    >     WHEN salary >= 70000.0 AND salary < 100000.0 THEN 'high'
    >     ELSE 'very high'
    >   END AS bracket FROM employees;
John Doe 100000.0 very high
Mary Smith 80000.0 high
Todd Jones 70000.0 high
Bill King 60000.0 middle
Boss Man 200000.0 very high
Fred Finance 150000.0 very high
Stacy Accountant 60000.0 middle

WHERE Clauses -- 筛选


SELECT * FROM employees
WHERE country = 'US' AND state = 'CA';

hive> SELECT name, salary, deductions["Federal Taxes"],
    >   salary * (1 - deductions["Federal Taxes"])
    > FROM employees
    > WHERE round(salary * (1 - deductions["Federal Taxes"])) > 70000;
John Doe    100000.0  0.2   80000.0

hive>  SELECT name, salary, deductions["Federal Taxes"],
    >    salary * (1 - deductions["Federal Taxes"]) as salary_minus_fed_taxes
    >  FROM employees
    >  WHERE round(salary_minus_fed_taxes) > 70000;
FAILED: Error in semantic analysis: Line 4:13 Invalid table alias or
column reference 'salary_minus_fed_taxes': (possible column names are:
name, salary, subordinates, deductions, address)

hive> SELECT e.* FROM
    > (SELECT name, salary, deductions["Federal Taxes"] as ded,
    >    salary * (1 - deductions["Federal Taxes"]) as salary_minus_fed_taxes
    >  FROM employees) e
    > WHERE round(e.salary_minus_fed_taxes) > 70000;
John Doe        100000.0        0.2     80000.0
Boss Man        200000.0        0.3     140000.0
Fred Finance    150000.0        0.3     105000.0


Predicate Operators


LIKE and RLIKE


hive> SELECT name, address.street FROM employees WHERE address.street LIKE '%Ave.';
John Doe        1 Michigan Ave.
Todd Jones      200 Chicago Ave.
hive> SELECT name, address.city FROM employees WHERE address.city LIKE 'O%';
Todd Jones      Oak Park
Bill King       Obscuria
hive> SELECT name, address.street FROM employees WHERE address.street LIKE '%Chi%';
Todd Jones      200 Chicago Ave.

hive> SELECT name, address.street
    > FROM employees WHERE address.street RLIKE '.*(Chicago|Ontario).*';
Mary Smith      100 Ontario St.
Todd Jones      200 Chicago Ave.

SELECT name, address FROM employees
WHERE address.street LIKE '%Chicago%' OR address.street LIKE '%Ontario%';


GROUP BY Clauses


hive> SELECT year(ymd), avg(price_close) FROM stocks
    > WHERE exchange = 'NASDAQ' AND symbol = 'AAPL'
    > GROUP BY year(ymd);
1984    25.578625440597534
1985    20.193676221040867
1986    32.46102808021274
1987    53.88968399108163
1988    41.540079275138766
1989    41.65976212516664
1990    37.56268799823263
1991    52.49553383386182
1992    54.80338610251119
1993    41.02671956450572
1994    34.0813495847914

HAVING Clauses


hive> SELECT year(ymd), avg(price_close) FROM stocks
    > WHERE exchange = 'NASDAQ' AND symbol = 'AAPL'
    > GROUP BY year(ymd)
    > HAVING avg(price_close) > 50.0;
1987    53.88968399108163
1991    52.49553383386182
1992    54.80338610251119
1999    57.77071460844979
2000    71.74892876261757
2005    52.401745992993554

Inner JOIN

hive> SELECT a.ymd, a.price_close, b.price_close
    > FROM stocks a JOIN stocks b ON a.ymd = b.ymd
    > WHERE a.symbol = 'AAPL' AND b.symbol = 'IBM';
2010-01-04      214.01  132.45
2010-01-05      214.38  130.85
2010-01-06      210.97  130.0
2010-01-07      210.58  129.55
2010-01-08      211.98  130.85
2010-01-11      210.11  129.48


Example 6-1. Query that will not work in Hive

SELECT a.ymd, a.price_close, b.price_close
FROM stocks a JOIN stocks b
ON a.ymd <= b.ymd
WHERE a.symbol = 'AAPL' AND b.symbol = 'IBM';

LEFT OUTER JOIN


hive> SELECT s.ymd, s.symbol, s.price_close, d.dividend
    > FROM stocks s LEFT OUTER JOIN dividends d ON s.ymd = d.ymd AND s.symbol = d.symbol
    > WHERE s.symbol = 'AAPL';
...
1987-05-01      AAPL    80.0    NULL
1987-05-04      AAPL    79.75   NULL
1987-05-05      AAPL    80.25   NULL
1987-05-06      AAPL    80.0    NULL
1987-05-07      AAPL    80.25   NULL
1987-05-08      AAPL    79.0    NULL
1987-05-11      AAPL    77.0    0.015
1987-05-12      AAPL    75.5    NULL
1987-05-13      AAPL    78.5    NULL
1987-05-14      AAPL    79.25   NULL
1987-05-15      AAPL    78.25   NULL
1987-05-18      AAPL    75.75   NULL
1987-05-19      AAPL    73.25   NULL
1987-05-20      AAPL    74.5    NULL
...

OUTER JOIN Gotcha


hive> SELECT s.ymd, s.symbol, s.price_close, d.dividend
    > FROM stocks s LEFT OUTER JOIN dividends d ON s.ymd = d.ymd AND s.symbol = d.symbol
    > WHERE s.symbol = 'AAPL'
    > AND s.exchange = 'NASDAQ' AND d.exchange = 'NASDAQ';
1987-05-11      AAPL    77.0    0.015
1987-08-10      AAPL    48.25   0.015
1987-11-17      AAPL    35.0    0.02
1988-02-12      AAPL    41.0    0.02
1988-05-16      AAPL    41.25   0.02
...

hive> SELECT s.ymd, s.symbol, s.price_close, d.dividend FROM
    > (SELECT * FROM stocks WHERE symbol = 'AAPL' AND exchange = 'NASDAQ') s
    > LEFT OUTER JOIN
    > (SELECT * FROM dividends WHERE symbol = 'AAPL' AND exchange = 'NASDAQ') d
    > ON s.ymd = d.ymd;
...
1988-02-10      AAPL    41.0    NULL
1988-02-11      AAPL    40.63   NULL
1988-02-12      AAPL    41.0    0.02
1988-02-16      AAPL    41.25   NULL
1988-02-17      AAPL    41.88   NULL

RIGHT OUTER JOIN


hive> SELECT s.ymd, s.symbol, s.price_close, d.dividend
    > FROM dividends d RIGHT OUTER JOIN stocks s ON d.ymd = s.ymd AND d.symbol = s.symbol
    > WHERE s.symbol = 'AAPL';
...
1987-05-07      AAPL    80.25   NULL
1987-05-08      AAPL    79.0    NULL
1987-05-11      AAPL    77.0    0.015
1987-05-12      AAPL    75.5    NULL
1987-05-13      AAPL    78.5    NULL


FULL OUTER JOIN


hive> SELECT s.ymd, s.symbol, s.price_close, d.dividend
    > FROM dividends d FULL OUTER JOIN stocks s ON d.ymd = s.ymd AND d.symbol = s.symbol
    > WHERE s.symbol = 'AAPL';
...
1987-05-07      AAPL    80.25   NULL
1987-05-08      AAPL    79.0    NULL
1987-05-11      AAPL    77.0    0.015
1987-05-12      AAPL    75.5    NULL
1987-05-13      AAPL    78.5    NULL
...


LEFT SEMI-JOIN


Example 6-2. Query that will not work in Hive

SELECT s.ymd, s.symbol, s.price_close FROM stocks s
WHERE s.ymd, s.symbol IN
(SELECT d.ymd, d.symbol FROM dividends d);

Instead, you use the following LEFT SEMI JOIN syntax:

hive> SELECT s.ymd, s.symbol, s.price_close
    > FROM stocks s LEFT SEMI JOIN dividends d ON s.ymd = d.ymd AND s.symbol = d.symbol;
...
1962-11-05      IBM     361.5
1962-08-07      IBM     373.25
1962-05-08      IBM     459.5
1962-02-06      IBM     551.5

ORDER BY and SORT BY


Here is an example using ORDER BY:

SELECT s.ymd, s.symbol, s.price_close
FROM stocks s
ORDER BY s.ymd ASC, s.symbol DESC;

Here is the same example using SORT BY instead:

SELECT s.ymd, s.symbol, s.price_close
FROM stocks s
SORT BY s.ymd ASC, s.symbol DESC;

Casting

SELECT name, salary FROM employees
WHERE cast(salary AS FLOAT) < 100000.0;

Queries that Sample Data -- 抽样


hive> SELECT * from numbers TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) s;
2
4
hive> SELECT * from numbers TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) s;
7
10
hive> SELECT * from numbers TABLESAMPLE(BUCKET 3 OUT OF 10 ON rand()) s;

hive> SELECT * from numbers TABLESAMPLE(BUCKET 3 OUT OF 10 ON number) s;
2
hive> SELECT * from numbers TABLESAMPLE(BUCKET 5 OUT OF 10 ON number) s;
4
hive> SELECT * from numbers TABLESAMPLE(BUCKET 3 OUT OF 10 ON number) s;
2

hive> SELECT * from numbers TABLESAMPLE(BUCKET 1 OUT OF 2 ON number) s;
2
4
6
8
10
hive> SELECT * from numbers TABLESAMPLE(BUCKET 2 OUT OF 2 ON number) s;
1
3
5
7
9


UNION ALL


SELECT log.ymd, log.level, log.message
  FROM (
    SELECT l1.ymd, l1.level,
      l1.message, 'Log1' AS source
    FROM log1 l1
  UNION ALL
    SELECT l2.ymd, l2.level,
      l2.message, 'Log2' AS source
    FROM log2 l2
  ) log
SORT BY log.ymd ASC;

FROM (
  FROM src SELECT src.key, src.value WHERE src.key < 100
  UNION ALL
  FROM src SELECT src.* WHERE src.key > 110
) unioninput
INSERT OVERWRITE DIRECTORY '/tmp/union.out' SELECT unioninput.*;


分享到:
评论

相关推荐

Global site tag (gtag.js) - Google Analytics