Data Analysis Example

Sunday, 2 October 2016

All about Greenplum Partitioning - The system catalog tables at your help

I was trying to improve and create ways to identify the partitioning mechanism of the Greenplum tables.

Please see the queries below; they will help you find all the partitioning details of a Greenplum table.
In the next post, we will see how to leverage this information and automate some of your Data Engineering Activities using Python. Finally, we will create a pipeline to move the data to HDFS using Apache Sqoop.

-- Demo table, range-partitioned on the sale date:
--   2008 is split into one partition per month, 2009 into one partition per day.
-- IF EXISTS keeps the script re-runnable when the table has not been created yet.
DROP TABLE IF EXISTS db_retail.sales;
CREATE TABLE db_retail.sales (id int, date date, amt decimal(10,2))
DISTRIBUTED BY (id)
PARTITION BY RANGE (date)
(
    START (date '2008-01-01') INCLUSIVE
    END (date '2009-01-01') EXCLUSIVE
    EVERY (INTERVAL '1 mon'),
    START (date '2009-01-01') INCLUSIVE
    END (date '2010-01-01') EXCLUSIVE
    EVERY (INTERVAL '1 day')
);

-- Demo table, range-partitioned on the numeric amount:
--   one partition per 10 units, from 1 (inclusive) up to 10000 (exclusive).
-- IF EXISTS keeps the script re-runnable when the table has not been created yet.
DROP TABLE IF EXISTS db_retail.sales_amt;
CREATE TABLE db_retail.sales_amt (id int, date date, amt decimal(10,2))
DISTRIBUTED BY (id)
PARTITION BY RANGE (amt)
(
    START (1) INCLUSIVE
    END (10000) EXCLUSIVE
    EVERY (10)
);

-- List the partition metadata of both demo tables, one query per table.
SELECT *
FROM pg_partitions
WHERE tablename = 'sales'
  AND schemaname = 'db_retail';

SELECT *
FROM pg_partitions
WHERE tablename = 'sales_amt'
  AND schemaname = 'db_retail';

-- Names of the child tables that back each partition of db_retail.sales.
SELECT partitiontablename
FROM pg_partitions
WHERE schemaname = 'db_retail'
  AND tablename = 'sales';

-- The same children, seen through the inheritance catalog.
-- The parent is schema-qualified so the lookup does not depend on search_path
-- (a bare 'sales' fails, or resolves to another table, when db_retail is not on it).
SELECT *
FROM pg_inherits AS i
INNER JOIN pg_class AS c
    ON c.oid = i.inhrelid
WHERE i.inhparent = 'db_retail.sales'::regclass;

-- Overall start/end of the partition ranges, per EVERY clause and partition type.
-- Grouping uses column names instead of select-list positions, so reordering the
-- select list cannot silently change the grouping.
-- NOTE(review): the range bounds in pg_partitions look like text expressions, so
-- min()/max() would compare them as strings -- confirm this is acceptable for the
-- table being inspected (it is not for numeric ranges of differing width).
SELECT
    min(partitionrangestart) AS min_range_start,
    max(partitionrangeend)   AS max_range_end,
    current_date             AS as_of_date,
    partitioneveryclause,
    partitiontype
FROM pg_partitions
WHERE schemaname = 'db_retail'
  AND tablename = 'sales'
GROUP BY
    partitioneveryclause,
    partitiontype;

-- Quick peek at the catalog of tables (arbitrary 10 rows; exploration only).
SELECT *
FROM information_schema.tables
LIMIT 10;

-- Column definitions needed to recreate the table.
-- Ordered by ordinal_position so the columns come back in declaration order;
-- numeric precision/scale are included because amt is declared decimal(10,2)
-- and cannot be rebuilt from data_type alone.
SELECT
    column_name,
    data_type,
    character_maximum_length,
    numeric_precision,
    numeric_scale
FROM information_schema.columns
WHERE table_schema = 'db_retail'
  AND table_name = 'sales'
ORDER BY ordinal_position;

-- Partition key columns, in key order.
SELECT
    columnname,
    position_in_partition_key
FROM pg_partition_columns
WHERE schemaname = 'db_retail'
  AND tablename = 'sales'
ORDER BY position_in_partition_key;

Sunday, 15 May 2016

HortonWorks Sandbox Hive error - How to resolve the issue

If you have recently downloaded the HDP vm and tried to launch HIVE, you must have encountered the below error:

[root@sandbox ~]# hive
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/hdp/2.3.2.0-2950/hadoop/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/hdp/2.3.2.0-2950/spark/lib/spark-assembly-1.4.1.2.3.2.0-2950-hadoop2.7.1.2.3.2.0-2950.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
WARNING: Use "yarn jar" to launch YARN applications.
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/hdp/2.3.2.0-2950/hadoop/lib/slf4j-log4j12-1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/hdp/2.3.2.0-2950/spark/lib/spark-assembly-1.4.1.2.3.2.0-2950-hadoop2.7.1.2.3.2.0-2950.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]

Logging initialized using configuration in file:/etc/hive/2.3.2.0-2950/0/hive-log4j.properties
Exception in thread "main" java.lang.RuntimeException: org.apache.hadoop.security.AccessControlException: Permission denied: user=root, access=WRITE, inode="/user/root":hdfs:hdfs:drwxr-xr-x
at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.check(FSPermissionChecker.java:319)
at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.check(FSPermissionChecker.java:292)
at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.checkPermission(FSPermissionChecker.java:213)
at org.apache.ranger.authorization.hadoop.RangerHdfsAuthorizer$RangerAccessControlEnforcer.checkPermission(RangerHdfsAuthorizer.java:300)
at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.checkPermission(FSPermissionChecker.java:190)
at org.apache.hadoop.hdfs.server.namenode.FSDirectory.checkPermission(FSDirectory.java:1771)

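The reason is that the HDFS directory /user is owned by hdfs:hdfs with permissions drwxr-xr-x, so only the hdfs user may write to it; root therefore cannot create its home directory /user/root.

As root you cannot change these permissions either, because root is not the owner of the directory (see the failed chmod below).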

Directory permission in HDFS

[root@sandbox ~]# hadoop fs -mkdir /user/root
mkdir: Permission denied: user=root, access=WRITE, inode="/user/root":hdfs:hdfs:drwxr-xr-x
[root@sandbox ~]# who am i
root pts/0 2016-05-15 07:50 (192.168.10.1)
[root@sandbox ~]# hadoop fs -chmod 777 /user
chmod: changing permissions of '/user': Permission denied. user=root is not the owner of inode=user

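The solution is to switch to a different user, one that is allowed to write to its HDFS home directory (for example `su - hdfs`, the owner of /user shown above), and launch Hive as that user: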

Change User and login to Hive - Successful login

Now, you are connected to Hive.

hive> show databases;
OK
default
xademo
Time taken: 1.32 seconds, Fetched: 2 row(s)
hive>

Like us on Facebook and Google if you liked the post. We provide live support for your project. Just fill in the inquiry form on the home page.

Or click on the link and fill your details: https://www.surveymonkey.com/r/zz962jr

Thursday, 21 April 2016

Basic Operations on Arrays - Python vs. Java

Below, we have listed the differences in array operations between Python and Java.

Python	Java
Python Array Declaration of 2 integers	Java Array Declaration of 2 integers
>>> arr = [0] * 2 >>> arr[0] 0 >>> arr[0] = 10 >>> arr[1] = 2 >>> arr[2] = 9 Traceback (most recent call last): File "<stdin>", line 1, in <module> IndexError: list assignment index out of range >>> arr.append(2) # this feature is not available in Java primitive arrays >>>	public class hello_world { public static void main(String[] args) { int[] arr = new int[2]; arr[0] = 10; arr[1] = 9; arr[2] = 5; System.out.println(arr[0] + " " + arr[1]); } } >> Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException: 2 at com.dmac.analytics.spark.hello_world.main(hello_world.java:9)

We can append values to an existing array in python.	In Java, a traditional array has to be reassigned to a new array with larger space allocated.

Java also has library classes that let you create a list that can grow and shrink.

Read up on the java.util.List interface (and its implementation java.util.ArrayList) to learn more about this Java data structure.

Please let us know if you like our posts. Like us on Google+ and Facebook.

Thursday, 14 April 2016

Advanced structures - Dictionary of tuples in Python

Today, I want to show you how you can create a dictionary object with a dictionary as key and dictionary as value.

So, it is a Dict of Dict. The final object is stored as a tuple.

Remember the rule of thumb that any key of a Python dictionary must be hashable. A plain dict is not hashable, so we will use frozendict, an immutable dictionary type provided by the third-party frozendict package.

Frozendict is hashable and can be used as a key in Python Dictionary.
You can use the below program effectively for searching for a key in Dictionary of Dictionaries in Python.

Let me know if you need any details. Like us on Google+ and Facebook if you like our posts.

import frozendict as util
from collections import defaultdict

### Create the first entry using hashable frozendict
match = util.frozendict({'switch': 1, 'dstmac': '00:00:00:00:00:01', 'srcmac': '00:00:00:00:00:01'})
# tuple(...) has to be *called* with a sequence; subscripting it as tuple[...]
# does not build a tuple.
match_array = tuple([match, 60000, 5])
count_bucket2 = dict(CountBucket1=140271057099664, CountBucket2=140271056501008)

### Create the second entry using hashable frozendict
match_entry = util.frozendict(switch=5, dstmac='00:00:00:00:00:00', srcmac='00:00:00:00:00:01')
match_array1 = tuple([match_entry, 59999, 7])

count_bucket1 = dict(CountBucket1=140271056467472, CountBucket2=140271056411280)

# Initialize the "dictionary of tuples": a tuple holding one single-entry dict
# per match, each keyed by a (frozendict, int, int) tuple.
dict_of_tuples = ({match_array: count_bucket2}, {match_array1: count_bucket1})

####### Your match entry
# Note: this is a plain list holding a plain dict, so it is not hashable and can
# only be compared against the stored keys, not used for a direct dict lookup.
match_entry = [{'switch': 1, 'dstmac': '00:00:00:00:00:01', 'srcmac': '00:00:00:00:00:01'}, 60000, 5]

# Treating the final structure as a tuple. Each element of the tuple is a
# dictionary, so walk the elements and compare each stored key with the entry.
# NOTE(review): the comparison relies on a frozendict comparing equal to a plain
# dict with the same items -- confirm for the installed frozendict version.
wanted = tuple(match_entry)
for entry in dict_of_tuples:
    # Each dict holds exactly one item; items() keeps key and value paired and
    # works on both Python 2 and Python 3 (unlike iterkeys()/.next()).
    key, val = next(iter(entry.items()))
    if key == wanted:
        # Printing one tuple gives the same output on Python 2 and Python 3.
        print(('Has key', 'corresponding value', val))
    else:
        print("Key not present")

Wednesday, 6 April 2016

Binary Tree implementation using Python - Python recursion usage

Below is a complete implementation of a BST using Python. A BST (Binary Search Tree) has the following properties:

1. Each node can have at most 2 children.
2. The left child is always smaller than its parent.
3. The right child is always greater than its parent (in the implementation below, equal values are also placed on the right).

## Implementation of Binary tree in python
# this is an unbalanced binary tree, so this may lead to skewness in data

class Node(object):
    """One node of the binary search tree.

    Attributes:
        val: the value stored in the node.
        parent: the parent Node, or None when no parent was given.
        left: left child Node, initially None.
        right: right child Node, initially None.
    """

    def __init__(self, val, parent=None):
        self.val = val
        self.parent = parent
        # Children are attached later by the tree when values are inserted.
        self.left = None
        self.right = None
class Tree(object):
    """Unbalanced binary search tree built from Node objects.

    Values smaller than a node go into its left subtree; values greater than
    or equal to it go into its right subtree. No rebalancing is performed.
    All print calls use a single argument so the output is the same on
    Python 2 and Python 3.
    """

    def __init__(self):
        self.root = None

    def getRoot(self):
        """Return the root Node, or None for an empty tree."""
        return self.root

    def add(self, val):
        """Insert val into the tree."""
        if self.root is None:
            self.root = Node(val)
        else:
            self._add(val, self.root)

    def _add(self, val, node):
        """Insert val somewhere below node (smaller left, otherwise right)."""
        if val < node.val:
            if node.left is not None:
                self._add(val, node.left)
            else:
                node.left = Node(val, node)
        else:
            if node.right is not None:
                self._add(val, node.right)
            else:
                node.right = Node(val, node)

    def find(self, val):
        """Return the Node holding val, or None when it is absent.

        A miss in a non-empty tree also prints "<val> value not found".
        """
        if self.root is not None:
            return self._find(val, self.root)
        return None

    def _find(self, val, node):
        """Search for val in the subtree rooted at node."""
        if val == node.val:
            return node
        elif val < node.val and node.left is not None:
            return self._find(val, node.left)
        elif val > node.val and node.right is not None:
            return self._find(val, node.right)
        else:
            print("%s value not found" % (val,))
            return None

    def deleteTree(self):
        """Empty the tree by dropping the reference to the root."""
        self.root = None

    def delete_node(self, val):
        """Remove the node holding val; do nothing when val is absent.

        Always returns None.
        """
        node = self.find(val)
        if node is None:
            return None
        self._delete_node(val, node)

    def _delete_node(self, val, node):
        """Unlink node from the tree while keeping the search order intact.

        val is unused; it is kept so the signature stays unchanged.
        """
        if node.left is None:
            # Leaf, or only a right child: the right child (possibly None)
            # takes the node's place.
            self._transplant(node, node.right)
        elif node.right is None:
            # Only a left child: it takes the node's place.
            self._transplant(node, node.left)
        else:
            # Two children: replace the node with its in-order successor, the
            # leftmost node of the right subtree. The successor is used rather
            # than the predecessor so that equal values stay in right subtrees.
            successor = node.right
            while successor.left is not None:
                successor = successor.left
            if successor.parent is not node:
                # Detach the successor from its current position first.
                self._transplant(successor, successor.right)
                successor.right = node.right
                successor.right.parent = successor
            self._transplant(node, successor)
            successor.left = node.left
            successor.left.parent = successor
        # Remove all links held by the deleted node.
        node.parent = None
        node.left = None
        node.right = None

    def _transplant(self, old, new):
        """Make new (a Node or None) occupy old's position under old's parent."""
        if old.parent is None:
            self.root = new
        elif old is old.parent.left:
            old.parent.left = new
        else:
            old.parent.right = new
        if new is not None:
            new.parent = old.parent

    def printTree(self):
        """Print the values in ascending order (in-order walk), one per line."""
        if self.root is not None:
            self._printTree(self.root)

    def _printTree(self, node):
        """In-order walk of the subtree rooted at node."""
        if node is not None:
            self._printTree(node.left)
            print(str(node.val) + ' ')
            self._printTree(node.right)

    def minimum(self, val):
        """Return the smallest value in the subtree rooted at the node holding
        val, or None when val is not in the tree."""
        node = self.find(val)
        if node is not None:
            return self._minimum(val, node)
        return None

    def _minimum(self, val, node):
        """Return the value of the leftmost node below node (val is unused)."""
        if node.left is not None:
            return self._minimum(val, node.left)
        return node.val

    def maximum(self, val):
        """Return the largest value in the subtree rooted at the node holding
        val, or None when val is not in the tree."""
        node = self.find(val)
        if node is None:
            # Mirrors minimum(): a missing value yields None instead of an
            # AttributeError.
            return None
        return self._maximum(val, node)

    def _maximum(self, val, node):
        """Return the value of the rightmost node below node (val is unused)."""
        if node.right is not None:
            return self._maximum(val, node.right)
        return node.val

    def successor(self, val):
        """Return the next larger value after val in in-order sequence, or None
        when val is absent or has no successor."""
        node = self.find(val)
        if node is None:
            return None
        if node.right is not None:
            return self._minimum(val, node.right)
        return self._successor(val, node, node.parent)

    def _successor(self, val, node, parent):
        """Climb from node until it is reached through a left link; that
        ancestor holds the successor. Returns None when the root is passed."""
        while parent is not None and parent.left is not node:
            node, parent = parent, parent.parent
        if parent is None:
            return None
        return parent.val

    def predecessor(self, val):
        """Return the next smaller value before val in in-order sequence, or
        None when val is absent or has no predecessor."""
        node = self.find(val)
        if node is None:
            return None
        if node.left is not None:
            return self._maximum(node.left.val, node.left)
        # No left subtree: the predecessor is the first ancestor reached
        # through a right link (there is none when we run past the root).
        parent = node.parent
        while parent is not None and parent.right is not node:
            node, parent = parent, parent.parent
        if parent is None:
            return None
        return parent.val

    def level_order_traversal(self, key=None):
        """Print the tree level by level and return the levels.

        Args:
            key: value whose node is used as the starting point; the whole
                tree is traversed when it is None.

        Returns:
            A list with one list of values per level, top level first. The
            list is empty for an empty tree or when key is not in the tree.
        """
        if key is None:
            start = self.root
        else:
            start = self.find(key)
        levels = []
        current = [start] if start is not None else []
        while current:
            levels.append([node.val for node in current])
            following = []
            for node in current:
                if node.left is not None:
                    following.append(node.left)
                if node.right is not None:
                    following.append(node.right)
            current = following
        for level in levels:
            print(level)
        return levels

    def pre_order_traversal(self):
        """Print the values in pre-order (node, left, right), one per line."""
        if self.root is not None:
            self._pre_order_traversal(self.root)

    def _pre_order_traversal(self, node):
        """Pre-order walk of the subtree rooted at node."""
        if node is not None:
            print(str(node.val) + ' ')
            self._pre_order_traversal(node.left)
            self._pre_order_traversal(node.right)


# Example: adding 3, 4, 0, -1, 8, 2, 2.5 and 1 (in that order) to an empty
# Tree produces the following shape:
#
#           3
#          / \
#         0   4
#        / \   \
#      -1   2   8
#          / \
#         1   2.5

# Demo: build a small tree and exercise the main operations.
# Every print call takes a single argument, so the script produces the same
# output on Python 2 and Python 3.
tree = Tree()
for value in (3, 4, 0, -1, 8, 2, 2.5, 1):
    tree.add(value)

# In-order walk: prints the values in ascending order.
tree.printTree()
print(tree.root.val)
print("find 2 %s" % (tree.find(2).parent.val,))
print("predecessor %s" % (tree.predecessor(-1),))

tree.delete_node(4)
tree.level_order_traversal()
# 100 was never added, so this reports "100 value not found".
tree.find(100)
tree.pre_order_traversal()