Sunday, 2 October 2016

All about Greenplum Partitioning - The system catalog tables at your help

I was trying to improve and create ways to identify the partitioning mechanism of the Greenplum tables.

Please see the queries below that will help you find all the partitioning details about a Greenplum table

In the next post, we will see how to leverage this information and automate some of your Data Engineering Activities using Python. Finally, we will create a pipeline to move the data to HDFS using Apache Sqoop.


DROP TABLE db_retail.sales ;
CREATE TABLE db_retail.sales (id int, date date, amt decimal(10,2))
DISTRIBUTED BY (id)
PARTITION BY RANGE (date)
( START (date '2008-01-01') INCLUSIVE
   END (date '2009-01-01') EXCLUSIVE every (interval '1 mon')
  ,Start (date '2009-01-01') inclusive
  end (date '2010-01-01') exclusive
   EVERY (INTERVAL '1 day') );

drop table db_retail.sales_amt;
CREATE TABLE db_retail.sales_amt (id int, date date, amt decimal(10,2))
DISTRIBUTED BY (id)
PARTITION BY RANGE (amt)
( START (1) INCLUSIVE
   END (10000) EXCLUSIVE every (10)
 );

--- check all the partitions for the table
select * from pg_partitions where schemaname = 'db_retail' and tablename = 'sales';

select * from pg_partitions where schemaname = 'db_retail' and tablename = 'sales_amt';


--- query to analyze each inherited partition table

select partitiontablename from pg_partitions where schemaname = 'db_retail' and tablename = 'sales';

select * from pg_inherits
join pg_class c on c.oid = inhrelid
where inhparent = 'sales'::regclass;

---fetch interval wise max and min ranges for each partition

select min(partitionrangestart),max(partitionrangeend)
, current_date 
, partitioneveryclause
, partitiontype 
from pg_partitions where schemaname = 'db_retail' and tablename = 'sales'
group by 3,4,5;

select * from information_schema.tables limit 10;

--- recreate the table with the desired partition and column
select column_name, data_type, character_maximum_length
from INFORMATION_SCHEMA.COLUMNS where table_name = 'sales' and table_schema = 'db_retail';

--- find the partition column list
select columnname, position_in_partition_key from pg_partition_columns where schemaname = 'db_retail' and tablename = 'sales';


Sunday, 15 May 2016

HortonWorks Sandbox Hive error - How to resolve the issue

If you have recently downloaded the HDP vm and tried to launch HIVE, you must have encountered the below error:

[root@sandbox ~]# hive
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/hdp/2.3.2.0-2950/hadoop/lib/slf4j-log4j12                                                                                                             -1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/hdp/2.3.2.0-2950/spark/lib/spark-assembly                                                                                                             -1.4.1.2.3.2.0-2950-hadoop2.7.1.2.3.2.0-2950.jar!/org/slf4j/impl/StaticLoggerBin                                                                                                             der.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
WARNING: Use "yarn jar" to launch YARN applications.
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/hdp/2.3.2.0-2950/hadoop/lib/slf4j-log4j12                                                                                                             -1.7.10.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/hdp/2.3.2.0-2950/spark/lib/spark-assembly                                                                                                             -1.4.1.2.3.2.0-2950-hadoop2.7.1.2.3.2.0-2950.jar!/org/slf4j/impl/StaticLoggerBin                                                                                                             der.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]

Logging initialized using configuration in file:/etc/hive/2.3.2.0-2950/0/hive-lo                                                                                                             g4j.properties

Exception in thread "main" java.lang.RuntimeException: org.apache.hadoop.securit                                                                                                             y.AccessControlException: Permission denied: user=root, access=WRITE, inode="/us                                                                                                             er/root":hdfs:hdfs:drwxr-xr-x
        at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.check(FSPe                                                                                                             rmissionChecker.java:319)
        at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.check(FSPe                                                                                                             rmissionChecker.java:292)
        at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.checkPermi                                                                                                             ssion(FSPermissionChecker.java:213)
        at org.apache.ranger.authorization.hadoop.RangerHdfsAuthorizer$RangerAcc                                                                                                             essControlEnforcer.checkPermission(RangerHdfsAuthorizer.java:300)
        at org.apache.hadoop.hdfs.server.namenode.FSPermissionChecker.checkPermi                                                                                                             ssion(FSPermissionChecker.java:190)
        at org.apache.hadoop.hdfs.server.namenode.FSDirectory.checkPermission(FS                                                                                                             Directory.java:1771)

The reason is the permission of the hdfs folder /user which is drwxr-xr-x


You cannot change it.



Directory permission in HDFS



[root@sandbox ~]# hadoop fs -mkdir /user/root
mkdir: Permission denied: user=root, access=WRITE, inode="/user/root":hdfs:hdfs:drwxr-xr-x
[root@sandbox ~]# who am i
root     pts/0        2016-05-15 07:50 (192.168.10.1)
[root@sandbox ~]# hadoop fs -chmod 777 /user
chmod: changing permissions of '/user': Permission denied. user=root is not the owner of inode=user

The solution is to change the user:

Change User and login to Hive - Successful login

Now, you are connected to Hive.

hive> show databases;
OK
default
xademo
Time taken: 1.32 seconds, Fetched: 2 row(s)
hive>

Like us on Facebook and Google if liked the post. We provide Live Support for your project. Just fill the Inquiry form on the home page.

Or click on the link and fill your details:   https://www.surveymonkey.com/r/zz962jr


Thursday, 21 April 2016

Basic Operations on Arrays - Python vs. Java


Below , we have listed the difference between array operations between Python and Java.
Python
Java
Python Array Declaration of 2 integers
Java Array Decalaration of 2 integers
>>> arr = [0] * 2
>>> arr[0]
0
>>> arr[0] = 10
>>> arr[1] = 2
>>> arr[2] = 9
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
IndexError: list assignment index out of range
>>> arr.append(2) # this feature is not available in Java primitive arrays
>>>
public class hello_world {

 public static void main(String[] args) {
  int[] arr = new int[2];
  arr[0] = 10;
  arr[1] = 9;
  arr[2] = 5;
  System.out.println(arr[0] + " " + arr[1]);
 
 }
}

>> Exception in thread "main" java.lang.ArrayIndexOutOfBoundsException: 2
 at com.dmac.analytics.spark.hello_world.main(hello_world.java:9)


We can append values to an existing array in python.
In Java, a traditional array has to be reassigned to a new array with larger space allocated.

Java also has a library that can use create a list that is mutable.

Read up on java.util.List() to know more about the java data structure.

Please let us know if you like our posts. Like us on Google+ and Facebook.  

Thursday, 14 April 2016

Advanced structures - Dictionary of tuples in Python

Today, I want to show you how you can create a dictionary object with a dictionary as key and dictionary as value.

So, it is a Dict of Dict. The final object is stored as a tuple.

Remember the thumb rule that any key for a python dictionary must be hashable. So, we will use a feature called as frozendict in python.

Frozendict is hashable and can be used as a key in Python Dictionary.
You can use the below program effectively for searching for a key in Dictionary of Dictionaries in Python.

Let me know if you need any details. Like us on Google+ and Facebook if you like our posts.

import frozendict as util
from collections import defaultdict

### Create the first entry using hashable frozendict
match = util.frozendict({'switch': 1, 'dstmac': '00:00:00:00:00:01', 'srcmac': '00:00:00:00:00:01'})
match_array = tuple[match,60000,5]
count_bucket2 = dict(CountBucket1 = 140271057099664, CountBucket2 = 140271056501008)

### Create the second entry using hashable frozendict
match_entry = util.frozendict(switch= 5, dstmac= '00:00:00:00:00:00', srcmac= '00:00:00:00:00:01')
match_array1 = tuple([match_entry, 59999, 7])

count_bucket1 = dict(CountBucket1 = 140271056467472, CountBucket2 = 140271056411280)

# Initialize the dictionary of tuples
dict_of_tuples = ({tuple(match_array) : count_bucket2},{tuple(match_array1) : count_bucket1})

####### Your match entry
match_entry = [{'switch': 1, 'dstmac': '00:00:00:00:00:01', 'srcmac': '00:00:00:00:00:01'},60000,5]                

#Treating the final structure as a tuple. Each element of the tuple is a #dictionary.
k = 0
while k < len(dict_of_tuples):
    key = dict_of_tuples[k].iterkeys()
    val = dict_of_tuples[k].itervalues()
    if key.next() == tuple(match_entry):
        print ('Has key','corresponding value',val.next())
    else:
        print "Key not present"
    k+= 1

Wednesday, 6 April 2016

Binary Tree implementation using Python - Python recursion usage

Below is a complete implementation of BST using Python. A BST(Binary Search Tree has the below properties)

1. Each parent can have max. 2 child
2. Left child is always smaller than parent
3. Right child is always greater than parent.



## Implementation of Binary tree in python
# this is an unbalanced binary tree, so this may lead to skewness in data

class Node():
    def __init__(self,val,parent= None):
        self.val = val
        self.parent = parent
        self.left = None
        self.right = None
class Tree():
    def __init__(self):
        self.root = None
    def getRoot(self):
        return self.root
    def add(self,val):
        if self.root == None:
            self.root = Node(val)
        else:
            self._add(val,self.root)
    def _add(self,val,node):
        if val < node.val:
            if node.left is not None:
                self._add(val,node.left)
            else:
                node.left = Node(val,node)
        else:
            if node.right is not None:
                self._add(val,node.right)
            else:
                node.right = Node(val,node)
   
    def find(self, val):
        # Search complexity when balanced tree is O(logn)
        if self.root is not None:
            return self._find(val,self.root)
        else:
            return None
    def _find(self,val,node):
        #print val,node.val
        if val == node.val:
            #print "matched"
            return node
        elif val < node.val and node.left is not None:
            return self._find(val,node.left)
        elif val > node.val and node.right is not None:
            return self._find(val,node.right)
        else:
            print val,"value not found"
            return None
    def deleteTree(self):
        self.root = None
       
    def delete_node(self,val):
        node = self.find(val)
        if node is None:
            return None
        else:
            self._delete_node(val,node)
   
    def _delete_node(self,val,node):
        if node == self.root:
            self.root = None
        else:
            # Case 1 when the node is a left node and a right node exists
            #if node == node.parent.left and node.left is not None:
            if node.left is not None:
                replacement = self.find(self.maximum(node.left.val))
                node.parent.left = replacement
                self.delete_node(replacement.val)
            #elif node == node.parent.right and node.right is not None:
            elif node.right is not None:
                r_val = node.right
                print "right node val",r_val.val
                print "minimum", self.minimum(r_val.val)
                replacement = self.find(self.minimum(node.right.val))
                # recursively delete the replacement from original position
                self.delete_node(replacement.val)
                node.parent.right = replacement
                replacement.parent = node.parent
                print "new right child",node.parent.right.val
               
                #removing all links for deleted node
                node.parent = None
                node.left = None
                node.right = None
            elif node.left is None and node.right is None:
                # Case 3 : Its a leaf node
                node.parent = None
               
   
   
   
    def printTree(self):
        if self.root is not None:
            self._printTree(self.root)
   
    # The printTree is a inorder tree walk
    def _printTree(self, node):
        #print "current node",node.val
        if node is not None:
            #print "called by",node.val
            self._printTree(node.left)
            print str(node.val) + ' '
           # print "right child of",node.val,node.right.val
            self._printTree(node.right)
           
    def minimum(self,val):
        node = self.find(val)
        #print node.val
        #minval = 0
        if node is not None:
            #print "recursive call"
            return self._minimum(val,node)
            #print "came back from child process",minval
        else:
            #"in final else"
            return node
    def _minimum(self,val,node):
        minval = 0
        if node.left is not None:
            #print  "parent node value",node.val, node.left.val
            return self._minimum(val,node.left)
        else:
            #print "In else", "parent node value",node.val
            minval = node.val
            #print "minval",minval
            return minval
        #print "outside now",minval
    def maximum(self,val):
        #print "find max"
        node = self.find(val)
        if node.right is not None:
            return self._maximum(val,node)
        else:
            #print "no right node",node.val
            return node.val
    def _maximum(self,val,node):
        if node.right is not None:
            return self._maximum(val,node.right)
        else:
            return node.val
    def successor(self,val):
        node = self.find(val)
        if node is None:
            return None
        elif node is not None and node.right is not None:
            return self._minimum(val,node.right)
        elif node.right is None:
            return self._successor(val,node,node.parent)
    def _successor(self,val,node,parent):
       
        if parent is None:
            return None
        elif parent.left is None :
            return self._successor(parent.val,parent,parent.parent)
        elif parent.left == node :
            #print "matched with left node"
            return parent.val
        elif parent.left <> node:
            return self._successor(parent.val,parent,parent.parent)
    def predecessor(self,val):
        node = self.find(val)
        if node is None:
            return None
        elif node.left is not None:
            return self._maximum(node.left.val,node.left)
        elif node.left is None and node <> node.parent.left:
            return node.parent.val
        else:
            return None
           
    #def transplant(T,u,v):
   
    def level_order_traversal(self,key=None):
        from copy import deepcopy
        if key is None:
            node = self.root
        else:
            node = Node(key)
        p_node = [node]
        level = [0]
        i = 0
        for x in p_node:
            if x is not None:
                if x.parent is not None:
                    #print "chekcing the traversal",x.val,"parent",x.parent.val
                    idx = p_node.index(x.parent) + 1
                    #print "parent info",x.parent.val,idx
                    nxtlevel = idx + 1
                else:
                    idx = 0
                    nxtlevel = 1
                    print "at root", idx, p_node[idx].val
                    temp = []
                if x.left is not None:
                    #print "left node exists"
                    p_node.append(x.left)
                    level.append(nxtlevel)
                if x.right is not None:
                    #print "right node exists","nextlevel",nxtlevel
                    p_node.append(x.right)
                    level.append(nxtlevel)
                # i keeps track of the element position in p_node
                #print "index",i, "level[i]",level[i],"curr node",x.val
                print level
                try:
                    if level[i] == level[i+1]:
                        #print "append"
                        temp.append(x.val)
                    else:
                            temp.append(x.val)
                            print [element for element in temp]
                            temp = []
                except IndexError:
                    temp.append(x.val)
                    print [element for element in temp]
                    temp = []
                    print "no more elements"
            i += 1
    def pre_order_traversal(self):
        if self.root is not None:
            self._pre_order_traversal(self.root)
   
    # The printTree is a inorder tree walk
    def _pre_order_traversal(self, node):
        #print "current node",node.val
        if node is not None:
            #print "called by",node.val
            print str(node.val) + ' '
            self._pre_order_traversal(node.left)
           
           # print "right child of",node.val,node.right.val
            self._pre_order_traversal(node.right)

""""
the graph is as below

            3
           / \
          0   4
         / \   \
       -1  2    8
          / \
         1   2.5
       
"""

tree = Tree()
tree.add(3)
tree.add(4)
tree.add(0)
tree.add(-1)
tree.add(8)
tree.add(2)
tree.add(2.5)
tree.add(1)
tree.printTree()
print tree.root.val
#print (tree.find(-1)).val
print "find 2",(tree.find(2)).parent.val
#print tree.minimum(4)
#print tree.maximum(0)
print "predecessor",tree.predecessor(-1)
tree.delete_node(4)
#tree.printTree()
tree.level_order_traversal()
tree.find(100)
tree.pre_order_traversal()
#print "successor",tree.successor(8)