Thursday, November 12, 2015

Network Graph

Network Data Analysis: Online University
  1. Load the dataset studentNetwork.RData, and read the datasets negative-words.txt and positive-words.txt into R.
#dir <- "C:/Users/Bhupendra Mishra/Desktop/donotbackup/"
#setwd(dir)
load("C:/Users/Bhupendra Mishra/Desktop/donotbackup/studentNetwork.RData")
#load("studentNetwork.RData")
  2. Make a plot of the network
library(igraph)
## (igraph startup messages trimmed; igraph masks decompose and spectrum
## from stats, and union from base)
library(statnet)
## (statnet startup messages trimmed; statnet loads network, networkDynamic,
## ergm, sna, tergm and ergm.count, and sna masks several igraph functions,
## including degree, betweenness and closeness)
plot(studentNetwork, main = "Student Network")

  3. How many nodes and edges are there in studentNetwork?
summary(studentNetwork)
## Network attributes:
##   vertices = 205
##   directed = FALSE
##   hyper = FALSE
##   loops = FALSE
##   multiple = FALSE
##   bipartite = FALSE
##  total edges = 203 
##    missing edges = 0 
##    non-missing edges = 203 
##  density = 0.009708274 
## 
## Vertex attributes:
## 
##  Course_of_Study:
##    character valued attribute
##    attribute summary:
##          Business         Fine_Arts      Liberal_Arts Physical_Sciences 
##               109                 4                68                 6 
##        Technology 
##                18 
## 
##  Sex:
##    character valued attribute
##    attribute summary:
##   F   M 
##  99 106 
## 
##  StudentID:
##    integer valued attribute
##    205 values
## 
##  Tweets:
##    character valued attribute
##    attribute summary:
##    the 10 most common values are pipe-delimited strings of sentiment
##    words (e.g. "abnormal|arbitrary|better-than-expected|dirt-cheap|..."),
##    each occurring once
## 
##  Year:
##    numeric valued attribute
##    attribute summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   3.000   2.732   4.000   6.000 
## 
## No edge attributes
## 
## Network edgelist matrix:
##        [,1] [,2]
##   [1,]    1   25
##   [2,]    1   52
##   [3,]    1   58
##   ...
## [203,]  189  191
## (full 203-row edgelist trimmed)
Answer: The network has 205 nodes and 203 edges.
  4. What are the attributes of this network object? What do they each contain?
Answer: There are five vertex attributes: 1. Course_of_Study (Business, Fine_Arts, Liberal_Arts, Physical_Sciences, Technology); 2. Sex (F, M); 3. StudentID (205 integer values); 4. Tweets (pipe-delimited character strings); 5. Year (numeric, 1 to 6).
  5. What proportion of the students are studying Business? What proportion are in their 2nd year?
symbol = c(6, 5, 3, 7, 4, 2)  # colour/symbol codes reused in the plots below
student.sex = studentNetwork%v%"Sex"
table(student.sex)
##   F   M 
##  99 106
barplot(table(student.sex), main = "Sex of Students", col = symbol)

student.course_study = studentNetwork%v%"Course_of_Study"
table(student.course_study)
##          Business         Fine_Arts      Liberal_Arts Physical_Sciences 
##               109                 4                68                 6 
##        Technology 
##                18
barplot(table(student.course_study), main = "Course of Study of Students", col = symbol)

student.year = studentNetwork%v%"Year"
table(student.year)
##  1  2  3  4  5  6 
## 62 40 42 25 24 12
barplot(table(student.year), main = "Year of Study of Students", col = symbol)
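Answer: from the tables above, 109 of 205 students study Business and 40 of 205 are in their 2nd year. A minimal sketch of the arithmetic in R:
prop.business <- sum(student.course_study == "Business") / length(student.course_study)  # 109/205, ~0.53
prop.year2 <- sum(student.year == 2) / length(student.year)                              # 40/205, ~0.20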
  6. Write a function that splits the pipe-delimited string into a vector of single words, counts which ones are positive and which ones are negative, assigns a score of +1 for each positive word and -1 for each negative word, and sums them for a total score. (The implementation appears below, after the network plots, using foreach.)
# Create symbol codes for the network graph legend
symbol.sex = c(4, 12)[match(student.sex, c("M", "F"))]
symbol.course_study = c(1, 2, 3, 4, 5)[match(student.course_study, c("Business", "Fine_Arts", "Liberal_Arts", "Physical_Sciences", "Technology"))]

plot(studentNetwork, vertex.sides = symbol.sex, vertex.rot = 45, vertex.cex = 2, vertex.col = symbol[student.year], edge.lwd = 2, cex.main = 1, displayisolates = TRUE, main = "Network Diagram - Student Year")

legend("bottomright", c("Year1", "Year2", "Year3", "Year4", "Year5", "Year6"), fill = symbol, cex=0.6)
Adjacency matrix: A network of nodes and their interconnections can be represented by an adjacency matrix. The adjacency matrix of a finite graph G on n vertices is the n x n matrix in which the non-diagonal entry a(ij) is the number of edges from vertex i to vertex j, and the diagonal entry a(ii), depending on convention, is either once or twice the number of edges (loops) from vertex i to itself. Undirected graphs often use the latter convention of counting loops twice, whereas directed graphs typically use the former. Each isomorphism class of graphs has a unique adjacency matrix, and that matrix is not the adjacency matrix of any other isomorphism class. In the special case of a finite simple graph, the adjacency matrix is a (0,1)-matrix with zeros on its diagonal; if the graph is undirected, the adjacency matrix is symmetric.
Reference: https://en.wikipedia.org/wiki/Adjacency_matrix
# Create the adjacency matrix
student.matrix = studentNetwork[,]
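Since studentNetwork is undirected with no loops, its adjacency matrix should be symmetric with a zero diagonal (a quick sanity check; all base R):
isSymmetric(unname(student.matrix))  # expected: TRUE
all(diag(student.matrix) == 0)       # expected: TRUE
# rowSums(student.matrix) gives each node's number of edges (its degree)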
Degree: A node's degree in an undirected network is defined as its number of edges to other nodes.
# Note: sna's degree() defaults to cmode = "freeman" (in-degree + out-degree),
# so with a symmetric matrix each undirected edge is counted twice below;
# gmode = "graph" would return the raw edge counts instead.
student.degree <- degree(student.matrix)
student.degree
##   [1] 26  8  0  0  2  0  0  6  8  0  4  0  4  2  8  4  2  8  2  0  6 12  2
##  [24]  0 14  0  4  0  4  8  4  2  2  4  0  6  0  2  0  2  0  0  6  6  0  0
##  [47] 18  0  0  0  6  8  4  6 18  4  2  4  6  4  6  0  4 10  4  8  0  2  0
##  [70]  6  2  0  0 10  4  4  4  2 10  0  2  2  4  0  0  2 20  4  4  2  2  8
##  [93]  2  0  0 14  2  2  6 10  2  8  4 10  4  0  0  6  4 10  2  2  0  6  6
## [116]  0  2  0  0  0  2  2 14  6  2  0 10  2  6  0  4  4  2  6  0 10  8  6
## [139] 12 10  2  6  0  2  0  2  2  2  6  8  4  0  6  0  2  2  8  8  0 16  6
## [162]  0  0  6  8  2  2  0  0  2  0  0  8  2  0  2  0  6  8  2  2  4  6  0
## [185] 10  2  8  0 14  8  4  4  2  6  6  4  0  2  2  2  4  2  0  6  2
  7. Plot a histogram of the scores. What does it indicate?
hist(student.degree, col = symbol, main = "Distribution of Nodes' Degree", ylab = "Number of Students", xlab = "Number of Connections")
Betweenness: A deeper measure of network structure is obtained through betweenness, a centrality measure of a node/vertex within a graph. Nodes that occur on many shortest paths between other nodes have higher betweenness than those that do not.
student.betweenness <- betweenness(student.matrix)

plot(student.betweenness, col="green", main="Betweenness Centrality", ylab="Betweenness")
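To see which students act as bridges, we can list the vertices with the largest betweenness (a minimal sketch; order() is base R and the values returned are vertex positions):
head(order(student.betweenness, decreasing = TRUE), 5)  # top-5 bridging students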

n.words <- read.table("C:/Users/Bhupendra Mishra/Desktop/donotbackup/negative-words.txt", header=TRUE, quote="\"")
p.words <- read.table("C:/Users/Bhupendra Mishra/Desktop/donotbackup/positive-words.txt", header=TRUE, quote="\"")
#View(p.words)
library(parallel)
library(foreach)
## Warning: package 'foreach' was built under R version 3.2.2
student.tweets=studentNetwork%v%"Tweets"
#student.tweets
tweet.score <- foreach(i = 1:205, .combine = 'rbind') %dopar% {
  # split the pipe-delimited tweet string into single words
  words <- unlist(strsplit(student.tweets[i], split = '\\|'))
  # flag words found in the positive and negative word lists
  pos.matches <- !is.na(match(words, unlist(p.words)))
  neg.matches <- !is.na(match(words, unlist(n.words)))
  # +1 per positive word, -1 per negative word
  sum(pos.matches) - sum(neg.matches)
}
## Warning: executing %dopar% sequentially: no parallel backend registered
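The warning above appears because no parallel backend is registered, so the loop runs sequentially. A hedged fix, assuming the doParallel package is installed:
library(doParallel)
registerDoParallel(cores = 2)  # after this, %dopar% actually runs in parallel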
hist(tweet.score, main="Sentiment analysis of the Students",xlab="Tweet Score",col=symbol)

  8. Do the distributions for 2nd-year and 4th-year students look different? How about between those studying Business and those studying Technology?
# Histograms of tweet scores for Year 2 and Year 4 students, respectively
par(mfrow = c(1,2))
hist(tweet.score[student.year==2], main="Sentiment - Year 2", xlab="Tweet Score", col=symbol)

hist(tweet.score[student.year==4], main="Sentiment - Year 4", xlab="Tweet Score", col=symbol)

# Histograms of tweet scores for Business and Technology students, respectively

hist(tweet.score[student.course_study=="Business"], main="Sentiment - Business", xlab="Tweet Score", col=symbol)

hist(tweet.score[student.course_study=="Technology"], main= "Sentiment - Technology", xlab="Tweet Score", col=symbol)

Friday, October 23, 2015

Data Governance and Agile Data Warehouse Development


Agile Data Warehouse Development

The following are the key points:

- Data governance council
- Data governance program
- Data governance model should be structured
- Correct policies, data definitions, and a charter should be defined
- Representatives from every department in the organisation should participate in defining data definitions

A successful data science team is governed by the following:

- data strategy
- data stewardship
- data governance


Data Storyteller vs. Analyst

Monday, September 21, 2015

Hive

The Hadoop Distributed File System (HDFS™) is the foundation of a Hadoop cluster. HDFS manages how datasets are stored in the cluster: it is responsible for distributing the data across the datanodes, managing replication for redundancy, and administrative tasks such as adding, removing, and recovering datanodes.


The Apache Hive project provides a data warehouse view of the data in HDFS. Using a SQL-like language, Hive lets you create summarizations of your data, perform ad hoc queries, and analyze large datasets in the Hadoop cluster. The overall approach with Hive is to project a table structure onto the dataset and then manipulate it with HiveQL. Since the data lives in HDFS, operations scale across all the datanodes, so you can manipulate huge datasets.


Hive installation
Issue 1
hduser@ubuntu:~/apache-hive-1.2.1-bin/bin$ hive

Logging initialized using configuration in jar:file:/home/hduser/apache-hive-1.2.1-bin/lib/hive-common-1.2.1.jar!/hive-log4j.properties
[ERROR] Terminal initialization failed; falling back to unsupported
java.lang.IncompatibleClassChangeError: Found class jline.Terminal, but interface was expected
at jline.TerminalFactory.create(TerminalFactory.java:101)
at jline.TerminalFactory.get(TerminalFactory.java:158)
at jline.console.ConsoleReader.<init>(ConsoleReader.java:229)
at jline.console.ConsoleReader.<init>(ConsoleReader.java:221)
at jline.console.ConsoleReader.<init>(ConsoleReader.java:209)
at org.apache.hadoop.hive.cli.CliDriver.setupConsoleReader(CliDriver.java:787)
at org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:721)
at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:681)
at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:621)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.hadoop.util.RunJar.run(RunJar.java:221)
at org.apache.hadoop.util.RunJar.main(RunJar.java:136)

Exception in thread "main" java.lang.IncompatibleClassChangeError: Found class jline.Terminal, but interface was expected
at jline.console.ConsoleReader.<init>(ConsoleReader.java:230)
at jline.console.ConsoleReader.<init>(ConsoleReader.java:221)
at jline.console.ConsoleReader.<init>(ConsoleReader.java:209)
at org.apache.hadoop.hive.cli.CliDriver.setupConsoleReader(CliDriver.java:787)
at org.apache.hadoop.hive.cli.CliDriver.executeDriver(CliDriver.java:721)
at org.apache.hadoop.hive.cli.CliDriver.run(CliDriver.java:681)
at org.apache.hadoop.hive.cli.CliDriver.main(CliDriver.java:621)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.hadoop.util.RunJar.run(RunJar.java:221)

at org.apache.hadoop.util.RunJar.main(RunJar.java:136)

Fix applied: remove jline-0.9.94.jar from the Hadoop lib directory.

hduser@ubuntu:/usr/local/hadoop-2.6.0/share/hadoop/yarn/lib$ mv jline-0.9.94.jar /home/hduser/

hduser@ubuntu:/usr/local/hadoop-2.6.0/share/hadoop/yarn/lib$ hive

Logging initialized using configuration in jar:file:/home/hduser/apache-hive-1.2.1-bin/lib/hive-common-1.2.1.jar!/hive-log4j.properties
hive> 


Installation of derby

wget http://archive.apache.org/dist/db/derby/db-derby-10.4.2.0/db-derby-10.4.2.0-bin.tar.gz

Reference
https://cwiki.apache.org/confluence/display/Hive/GettingStarted
http://www.tutorialspoint.com/hive/hive_installation.htm



https://www.facebook.com/notes/facebook-engineering/join-optimization-in-apache-hive/470667928919


Granularity is the extent to which a material or system is composed of distinguishable pieces or grains; it can refer either to the extent to which a larger entity is subdivided, or to the extent to which smaller indistinguishable entities have joined together.
Coarse-grained materials or systems have fewer, larger discrete components than fine-grained materials or systems.

Partition & Bucketing

Partitioning: PARTITIONED BY

Bucketing: CLUSTERED BY
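A minimal DDL sketch showing both clauses together (the table and column names here are hypothetical, not part of the dataset below):

-- one HDFS subdirectory per value of yr; 16 hash buckets on userid inside each
CREATE TABLE ratings_bucketed (
  userid INT,
  movieid INT,
  rating INT
)
PARTITIONED BY (yr INT)
CLUSTERED BY (userid) INTO 16 BUCKETS
ROW FORMAT DELIMITED FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;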


Sample use case

Movielens user ratings

1. download data
wget http://files.grouplens.org/datasets/movielens/ml-100k.zip

2. create table
CREATE TABLE u_data (
  userid INT,
  movieid INT,
  rating INT,
  unixtime STRING)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE;

3. Unzip
unzip ml-100k.zip

4. Load data into table
hive> load data local inpath '/home/cloudera/ml-100k/u.data' overwrite into table u_data;
Copying data from file:/home/cloudera/ml-100k/u.data
Copying file: file:/home/cloudera/ml-100k/u.data
Loading data to table default.u_data
rmr: DEPRECATED: Please use 'rm -r' instead.
Moved: 'hdfs://localhost.localdomain:8020/user/hive/warehouse/u_data' to trash at: hdfs://localhost.localdomain:8020/user/cloudera/.Trash/Current
Table default.u_data stats: [num_partitions: 0, num_files: 1, num_rows: 0, total_size: 1979173, raw_data_size: 0]
OK
Time taken: 4.508 seconds
5. Find the number of records in the table
hive> select count(*) from u_data;
Total MapReduce jobs = 1
Launching Job 1 out of 1
Number of reduce tasks determined at compile time: 1
Starting Job = job_201509182138_0001, Tracking URL = http://0.0.0.0:50030/jobdetails.jsp?jobid=job_201509182138_0001
Kill Command = /usr/lib/hadoop/bin/hadoop job -kill job_201509182138_0001
Hadoop job information for Stage-1: number of mappers: 1; number of reducers: 1
2015-09-19 07:06:42,926 Stage-1 map = 0%, reduce = 0%
2015-09-19 07:06:58,579 Stage-1 map = 100%, reduce = 0%, Cumulative CPU 3.87 sec
2015-09-19 07:07:06,903 Stage-1 map = 100%, reduce = 100%, Cumulative CPU 5.04 sec
MapReduce Total cumulative CPU time: 5 seconds 40 msec
Ended Job = job_201509182138_0001
MapReduce Jobs Launched:
Job 0: Map: 1  Reduce: 1  Cumulative CPU: 5.04 sec  HDFS Read: 1979394  HDFS Write: 7  SUCCESS
Total MapReduce CPU Time Spent: 5 seconds 40 msec
OK
100000
Time taken: 45.919 seconds
Let's try a slightly more complex query.
Create a Python script as follows:
vi weekday_mapper.py
import sys
import datetime

for line in sys.stdin:
  line = line.strip()
  userid, movieid, rating, unixtime = line.split('\t')
  weekday = datetime.datetime.fromtimestamp(float(unixtime)).isoweekday()
  print '\t'.join([userid, movieid, rating, str(weekday)])


Create a Hive SQL script:
vi mapper.sql

CREATE TABLE u_data_new (
  userid INT,
  movieid INT,
  rating INT,
  weekday INT)
ROW FORMAT DELIMITED
FIELDS TERMINATED BY '\t';

add FILE weekday_mapper.py;

INSERT OVERWRITE TABLE u_data_new
SELECT
  TRANSFORM (userid, movieid, rating, unixtime)
  USING 'python weekday_mapper.py'
  AS (userid, movieid, rating, weekday)
FROM u_data;

SELECT weekday, COUNT(*)
FROM u_data_new
GROUP BY weekday;

Run the Hive script as follows:
[cloudera@localhost ml-100k]$ hive -f ./mapper.sql
************************************************************************************
Working on the MovieLens 1M dataset
Download the data
Unzip ml-1m.zip
Data preparation:
Change the column separator from '::' to '#'
[cloudera@localhost ml-1m]$ sed 's/::/#/g' ratings.dat > ratings.t
[cloudera@localhost ml-1m]$ sed 's/::/#/g' users.dat > users.t
[cloudera@localhost ml-1m]$ sed 's/::/#/g' movies.dat > movies.t
# Create the occupations lookup file, one id#occupation record per line
vi occupations.t
0#other/not specified
1#academic/educator
2#artist
3#clerical/admin
4#college/grad student
5#customer service
6#doctor/health care
7#executive/managerial
8#farmer
9#homemaker
10#K-12 student
11#lawyer
12#programmer
13#retired
14#sales/marketing
15#scientist
16#self-employed
17#technician/engineer
18#tradesman/craftsman
19#unemployed
20#writer
# Create the tables as follows
vi createtab.sql
create database movielens;
use movielens;

CREATE EXTERNAL TABLE ratings (
  userid INT, movieid INT, rating INT, tstamp STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '#'
STORED AS TEXTFILE LOCATION '/dataset/movielens/ratings';

CREATE EXTERNAL TABLE movies (
  movieid INT, title STRING, genres ARRAY<STRING>
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '#'
COLLECTION ITEMS TERMINATED BY "|"
STORED AS TEXTFILE LOCATION '/dataset/movielens/movies';

CREATE EXTERNAL TABLE users (
  userid INT, gender STRING, age INT, occupation INT, zipcode STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '#'
STORED AS TEXTFILE LOCATION '/dataset/movielens/users';

CREATE EXTERNAL TABLE occupations (
  id INT, occupation STRING
) ROW FORMAT DELIMITED FIELDS TERMINATED BY '#'
STORED AS TEXTFILE LOCATION '/dataset/movielens/occupations';
$ hive -f ./createtab.sql
# Load the data into the tables (the HDFS target directories may need to be
# created first with hadoop fs -mkdir -p)
hadoop fs -put ratings.t /dataset/movielens/ratings
hadoop fs -put movies.t /dataset/movielens/movies
hadoop fs -put users.t /dataset/movielens/users
hadoop fs -put occupations.t /dataset/movielens/occupations
$ vi concate.sql
CREATE TABLE rating_full AS
SELECT r.*,
       m.title AS m_title,
       concat_ws('|', sort_array(m.genres)) AS m_genres,
       u.gender AS u_gender,
       u.age AS u_age,
       u.occupation AS u_occupation,
       u.zipcode AS u_zipcode
FROM ratings r
JOIN movies m ON (r.movieid = m.movieid)
JOIN users u ON (r.userid = u.userid);
$ hive -f ./concate.sql
hive> desc rating_full;
# Create training/testing tables such that they hold 80/20 of the original data
vi traintest.sql
SET hivevar:seed=31;

CREATE TABLE ratings2 as
select rand(${seed}) as rnd, userid, movieid, rating from ratings;

CREATE TABLE training as
select * from ratings2 order by rnd DESC limit 800000;

CREATE TABLE testing as
select * from ratings2 order by rnd ASC limit 200209;

$ hive -f ./traintest.sql