Saturday, December 28, 2019

Simple Go web app on Docker for a quick test

Golang App

mkdir golang-web-app
cd golang-web-app
vim main.go
package main

import (
	"fmt"
	"log"
	"net/http"
	"os"
	"time"
)

func main() {

	http.HandleFunc("/", func(w http.ResponseWriter, r *http.Request) {
		hostname, err := os.Hostname()
		if err != nil {
			panic(err)
		}

		fmt.Fprintf(w, "Hello from host: %q. The current time is %q", hostname, time.Now().String())
	})


	log.Fatal(http.ListenAndServe(":8080", nil))

}

Run and unit test it

go run main.go
then access http://localhost:8080 in a browser
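
There is no test in the snippet above, so here is a minimal sketch of one using the standard net/http/httptest package. It copies the handler logic into a named function so it can be called directly; in a real refactor you would extract it from main.go instead. The helloHandler name and the main_test.go file are illustrative additions, not part of the original code.

// main_test.go
package main

import (
	"fmt"
	"net/http"
	"net/http/httptest"
	"os"
	"strings"
	"testing"
)

// helloHandler carries the same logic as the anonymous handler in main.go,
// written as a named function so it can be exercised directly.
func helloHandler(w http.ResponseWriter, r *http.Request) {
	hostname, err := os.Hostname()
	if err != nil {
		http.Error(w, err.Error(), http.StatusInternalServerError)
		return
	}
	fmt.Fprintf(w, "Hello from host: %q", hostname)
}

func TestHelloHandler(t *testing.T) {
	req := httptest.NewRequest(http.MethodGet, "/", nil)
	rec := httptest.NewRecorder()

	helloHandler(rec, req)

	if rec.Code != http.StatusOK {
		t.Fatalf("expected status 200, got %d", rec.Code)
	}
	if !strings.Contains(rec.Body.String(), "Hello from host") {
		t.Errorf("unexpected response body: %q", rec.Body.String())
	}
}

Run it with go test from the same directory.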

Dockerfile

FROM golang:1.12.0-alpine3.9
RUN mkdir /app
ADD . /app
WORKDIR /app
RUN go build -o main .
CMD ["/app/main"]

Build Docker Image

docker build -t golang-web-app .
-- after the build completes, verify that the image was created
docker images

Run Docker container

docker run -p 8080:8080 -it golang-web-app

If everything goes well, you can access the http://localhost:8080 endpoint.
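
You can also smoke test the running container programmatically with a tiny Go client. A minimal sketch (the check.go file name is illustrative; it assumes port 8080 is published exactly as in the docker run command above):

// check.go
package main

import (
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
)

func main() {
	// Hit the port published by `docker run -p 8080:8080 ...`.
	resp, err := http.Get("http://localhost:8080")
	if err != nil {
		log.Fatalf("request failed: %v", err)
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Fatalf("unable to read response: %v", err)
	}
	fmt.Printf("status: %d, body: %s\n", resp.StatusCode, body)
}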


Tuesday, August 20, 2019

Apache Hive client (implemented in Go) to connect to HiveServer2

For more information on the Go Hive driver, refer to https://github.com/beltran/gohive

Sample Table
create table test (id int);
insert into table test values (1),(2),(3),(4),(5);
select * from test;
1
2
3
4
5

hiveclient.go

package main

import (
	"context"
	"fmt"
	"github.com/beltran/gohive"
	"log"
)

func main() {
	conf := gohive.NewConnectConfiguration()

	conn, err := gohive.Connect("hdp31ab", 10500, "NONE", conf)
	if err != nil {
		log.Fatalf("error occurred while getting the connection: %v", err)
	}
	defer conn.Close()

	cur := conn.Cursor()
	defer cur.Close()

	ctx := context.Background()

	cur.Exec(ctx, "select * from test")
	if cur.Err != nil {
		log.Fatalf("unable to get the result set from HiveServer2: %v", cur.Err)
	}

	var rowcell1 int32
	for cur.HasMore(ctx) {
		cur.FetchOne(ctx, &rowcell1)
		if cur.Err != nil {
			log.Fatalf("error while fetching a row: %v", cur.Err)
		}
		fmt.Println(rowcell1)
	}

}

Run it!!!

go run hiveclient.go 
1
2
3
4
5

Thursday, July 25, 2019

Debugging a HiveServer2 Docker container remotely using IntelliJ

HiveServer2 Dockerfile

Note the JAVA_TOOL_OPTIONS environment variable, which carries the remote debugging options.

FROM centos

# Basic hygiene
RUN yum upgrade -y && \
    yum update -y && \
    yum install -y java-1.8.0-openjdk-devel wget sudo unzip git maven which

# Get Hadoop
RUN mkdir /grid && \
    cd /grid &&  \
    wget http://mirrors.ocf.berkeley.edu/apache/hadoop/common/hadoop-3.1.1/hadoop-3.1.1.tar.gz &&  \
    tar zxf hadoop-3.1.1.tar.gz

# Get Hive
RUN cd /grid &&  \
    wget http://mirrors.ocf.berkeley.edu/apache/hive/hive-3.1.1/apache-hive-3.1.1-bin.tar.gz &&  \
    tar zxf apache-hive-3.1.1-bin.tar.gz

RUN mkdir /root/hdfs-scratch && \
    mkdir -p /user/hive/warehouse && \
    mkdir /tmp/hive && \
    chmod 777 /tmp/hive


# Create Hive User
RUN groupadd hive && \
    useradd -g hive --shell=/bin/bash -m -d /home/hive hive && \
    chown -R hive /user/hive/ && \
    chgrp -R hive /user/hive/

USER hive

# Set up environment variables
ENV HIVE_HOME /grid/apache-hive-3.1.1-bin
ENV HADOOP_HOME /grid/hadoop-3.1.1
ENV PATH $PATH:$HADOOP_HOME/bin:$HIVE_HOME/bin
ENV JAVA_HOME /usr
ENV JAVA_TOOL_OPTIONS -agentlib:jdwp=transport=dt_socket,address=8000,server=y,suspend=n

EXPOSE 10000
WORKDIR /home/hive

CMD ["hive", "--service", "hiveserver2", "--hiveconf", "datanucleus.schema.autoCreateAll=true", "--hiveconf", "hive.metastore.schema.verification=false"]

Build the Docker image

docker build . -t hive3-image-debug

Run the Docker container with the debug port exposed

// alternative: pass the debug options to the container as an environment variable
docker run -it --net=myNetwork -p 10000:10000 -p 8000:8000 -e "JAVA_TOOL_OPTIONS=\"-agentlib:jdwp=transport=dt_socket,address=8000,server=y,suspend=n\"" hive3-image

docker run -it --net=myNetwork -p 10000:10000 -p 8000:8000   hive3-image-debug

Create a remote debug profile in IntelliJ, set a breakpoint, and you are good to go.


Collect jstack, jmap, and other JVM diagnostics for a JVM running inside a Docker container

Run HiveServer2 in a Docker container (follow the steps at http://rajkrrsingh.blogspot.com/2019/07/running-hiveserver2-on-docker.html).

Get the container ID or name of the running container.


docker ps -a
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
0749ae15ece8 hive3-image "hive --service hive…" 20 hours ago Up 20 hours 0.0.0.0:10000->10000/tcp jovial_swartz


Now you can use the container ID or the container name to connect to the container interactively.

docker exec -it 0749ae15ece8 /bin/bash
[hive@0749ae15ece8 ~]$ pwd
/home/hive
[hive@0749ae15ece8 ~]$ jps
1 RunJar
810 Jps
[hive@0749ae15ece8 ~]$ ps aux | grep hiveserver
hive 1 1.7 21.0 2170608 431380 pts/0 Ssl+ 11:47 6:57 /usr/bin/java -Dproc_jar -Dproc_hiveserver2 -Dlog4j.configurationFile=hive-log4j2.properties -Djava.util.logging.config.file=/grid/apache-hive-3.1.1-bin/conf/parquet-logging.properties -Dyarn.log.dir=/grid/hadoop-3.1.1/logs -Dyarn.log.file=hadoop.log -Dyarn.home.dir=/grid/hadoop-3.1.1 -Dyarn.root.logger=INFO,console -Djava.library.path=/grid/hadoop-3.1.1/lib/native -Xmx256m -Dhadoop.log.dir=/grid/hadoop-3.1.1/logs -Dhadoop.log.file=hadoop.log -Dhadoop.home.dir=/grid/hadoop-3.1.1 -Dhadoop.id.str=hive -Dhadoop.root.logger=INFO,console -Dhadoop.policy.file=hadoop-policy.xml -Dhadoop.security.logger=INFO,NullAppender org.apache.hadoop.util.RunJar /grid/apache-hive-3.1.1-bin/lib/hive-service-3.1.1.jar org.apache.hive.service.server.HiveServer2 --hiveconf datanucleus.schema.autoCreateAll=true --hiveconf hive.metastore.schema.verification=false
hive 828 0.0 0.0 9096 876 pts/1 S+ 18:25 0:00 grep --color=auto hiveserver
[hive@0749ae15ece8 ~]$ jstack -l 1
......


jmap -histo:live 1

....


Wednesday, July 24, 2019

Running HiveServer2 on Docker

Create and run the HiveServer2 Docker container

mkdir hive-3-image
cd hive-3-image/
wget https://raw.githubusercontent.com/alanfgates/sqltest/master/dbs/hive/v3_1/Dockerfile
docker build . -t hive3-image
// Run it
docker run -it --net=myNetwork -p 10000:10000 hive3-image

You will see the following logs on stdout if everything goes well:

which: no hbase in (/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/grid/hadoop-3.1.1/bin:/grid/apache-hive-3.1.1-bin/bin)
2019-07-24 22:22:18: Starting HiveServer2
SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/grid/apache-hive-3.1.1-bin/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/grid/hadoop-3.1.1/share/hadoop/common/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Hive Session ID = 7ac431e6-9619-4cfc-a98d-00cb3cd6e9dc
Hive Session ID = cb5986e4-38dd-401c-aea6-d7f1032790ad
Hive Session ID = 525af947-9fc4-4445-a7c3-227ae861f513
Hive Session ID = ee73bf5e-4baa-420e-b409-55dabf67cf61

Now create and run the beeline container

mkdir beeline-docker
cd beeline-docker
wget https://raw.githubusercontent.com/alanfgates/sqltest/master/dbs/hive/v3_1/Dockerfile

I know this is a very naive approach, but let's stick with it: get the hostname of the HiveServer2 container (by default, the container ID is the hostname).

 docker ps -a | grep hive3-image
 CONTAINER ID        IMAGE                      COMMAND                  CREATED             STATUS                            PORTS                      NAMES
0749ae15ece8        hive3-image                "hive --service hive…"   2 minutes ago       Up 2 minutes                      0.0.0.0:10000->10000/tcp   jovial_swartz

// get hostname 
docker inspect 0749ae15ece8 | grep -i host
"HostnamePath": "/var/lib/docker/containers/0749ae15ece81ae9e429d19f28c9f0bed52f7837f6984f9619aab6618f97a5f4/hostname",
       "HostsPath": "/var/lib/docker/containers/0749ae15ece81ae9e429d19f28c9f0bed52f7837f6984f9619aab6618f97a5f4/hosts",
       "HostConfig": {
                       "HostIp": "",
                       "HostPort": "10000"
           "ExtraHosts": null,
           "Hostname": "0749ae15ece8",
                       "HostIp": "0.0.0.0",
                       "HostPort": "10000"

Modify the CMD in Dockerfile

CMD ["beeline","-u","jdbc:hive2://0749ae15ece8:10000"]

Build the beeline Docker image from this definition

docker build . -t beeline_docker

Run it

docker run -it --net=myNetwork beeline_docker

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/grid/apache-hive-3.1.1-bin/lib/log4j-slf4j-impl-2.10.0.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/grid/hadoop-3.1.1/share/hadoop/common/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.apache.logging.slf4j.Log4jLoggerFactory]
Connecting to jdbc:hive2://0749ae15ece8:10000
Connected to: Apache Hive (version 3.1.1)
Driver: Hive JDBC (version 3.1.1)
Transaction isolation: TRANSACTION_REPEATABLE_READ
Beeline version 3.1.1 by Apache Hive
0: jdbc:hive2://0749ae15ece8:10000> show tables;
INFO  : Compiling command(queryId=hive_20190724222711_18df4f09-d690-4774-af47-abe0c63bf8b2): show tables
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Semantic Analysis Completed (retrial = false)
INFO  : Returning Hive schema: Schema(fieldSchemas:[FieldSchema(name:tab_name, type:string, comment:from deserializer)], properties:null)
INFO  : Completed compiling command(queryId=hive_20190724222711_18df4f09-d690-4774-af47-abe0c63bf8b2); Time taken: 1.391 seconds
INFO  : Concurrency mode is disabled, not creating a lock manager
INFO  : Executing command(queryId=hive_20190724222711_18df4f09-d690-4774-af47-abe0c63bf8b2): show tables
INFO  : Starting task [Stage-0:DDL] in serial mode
INFO  : Completed executing command(queryId=hive_20190724222711_18df4f09-d690-4774-af47-abe0c63bf8b2); Time taken: 0.063 seconds
INFO  : OK
INFO  : Concurrency mode is disabled, not creating a lock manager
+-----------+
| tab_name  |
+-----------+
+-----------+
No rows selected (1.974 seconds)
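
Beeline is not the only possible client here. Because port 10000 is published to the host, the Go Hive driver covered in the August 20 post above can also connect directly from the host machine. A minimal sketch (the localhost address and the default NONE authentication are assumptions about this particular setup):

package main

import (
	"context"
	"fmt"
	"log"

	"github.com/beltran/gohive"
)

func main() {
	conf := gohive.NewConnectConfiguration()

	// Port 10000 is published by `docker run -p 10000:10000 ...`,
	// so HiveServer2 is reachable on localhost from the host machine.
	conn, err := gohive.Connect("localhost", 10000, "NONE", conf)
	if err != nil {
		log.Fatalf("unable to connect to HiveServer2: %v", err)
	}
	defer conn.Close()

	cur := conn.Cursor()
	defer cur.Close()

	ctx := context.Background()
	cur.Exec(ctx, "show tables")
	if cur.Err != nil {
		log.Fatalf("show tables failed: %v", cur.Err)
	}

	var table string
	for cur.HasMore(ctx) {
		cur.FetchOne(ctx, &table)
		if cur.Err != nil {
			log.Fatalf("error while fetching a row: %v", cur.Err)
		}
		fmt.Println(table)
	}
}

With an empty warehouse it prints nothing, matching the empty show tables output above.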

Monday, March 25, 2019

Hive Kafka Integration

#### ENV: HDP-3.1
#### Data setup:
```
cat sample-data.json
{"name": "Raj","address": {"a": "b","c": "d","e": "f"}}
{"name": "Raj1","address": {"a": "bb","c": "dd","e": "ff"}}
```
#### Create topic in Kafka and Ingest data into it.
```
/usr/hdp/current/kafka-broker/bin/kafka-topics.sh --create --zookeeper `hostname`:2181 --replication-factor 1 --partitions 1 --topic dummytopic1
cat sample-data.json | /usr/hdp/current/kafka-broker/bin/kafka-console-producer.sh --broker-list `hostname -f`:6667 --topic dummytopic1
```
#### Create Hive table:
```
CREATE EXTERNAL TABLE kafka_test_table (name string, address struct<a:string,c:string,e:string>) STORED BY 'org.apache.hadoop.hive.kafka.KafkaStorageHandler' TBLPROPERTIES ("kafka.topic" = "dummytopic1" , "kafka.bootstrap.servers" = "hdp31a:6667" , "kafka.consumer.security.protocol" = "PLAINTEXT");
-- show create table
+----------------------------------------------------+
| createtab_stmt |
+----------------------------------------------------+
| CREATE EXTERNAL TABLE `kafka_test_table`( |
| `name` string COMMENT 'from deserializer', |
| `address` struct<a:string,c:string,e:string> COMMENT 'from deserializer', |
| `__key` binary COMMENT 'from deserializer', |
| `__partition` int COMMENT 'from deserializer', |
| `__offset` bigint COMMENT 'from deserializer', |
| `__timestamp` bigint COMMENT 'from deserializer') |
| ROW FORMAT SERDE |
| 'org.apache.hadoop.hive.kafka.KafkaSerDe' |
| STORED BY |
| 'org.apache.hadoop.hive.kafka.KafkaStorageHandler' |
| WITH SERDEPROPERTIES ( |
| 'serialization.format'='1') |
| LOCATION |
| 'hdfs://hdp31a.hdp.local:8020/warehouse/tablespace/external/hive/kafka_test_table' |
| TBLPROPERTIES ( |
| 'bucketing_version'='2', |
| 'hive.kafka.max.retries'='6', |
| 'hive.kafka.metadata.poll.timeout.ms'='30000', |
| 'hive.kafka.optimistic.commit'='false', |
| 'hive.kafka.poll.timeout.ms'='5000', |
| 'kafka.bootstrap.servers'='hdp31a:6667', |
| 'kafka.consumer.security.protocol'='PLAINTEXT', |
| 'kafka.serde.class'='org.apache.hadoop.hive.serde2.JsonSerDe', |
| 'kafka.topic'='dummytopic1', |
| 'kafka.write.semantic'='AT_LEAST_ONCE', |
| 'transient_lastDdlTime'='1553536265') |
+----------------------------------------------------+
```
#### Query table:
```
-- select * from kafka_test_table;
+------------------------+-------------------------------+-------------------------+-------------------------------+----------------------------+-------------------------------+
| kafka_test_table.name | kafka_test_table.address | kafka_test_table.__key | kafka_test_table.__partition | kafka_test_table.__offset | kafka_test_table.__timestamp |
+------------------------+-------------------------------+-------------------------+-------------------------------+----------------------------+-------------------------------+
| Raj | {"a":"b","c":"d","e":"f"} | NULL | 0 | 0 | 1553536022213 |
| Raj1 | {"a":"bb","c":"dd","e":"ff"} | NULL | 0 | 1 | 1553536464796 |
+------------------------+-------------------------------+-------------------------+-------------------------------+----------------------------+-------------------------------+
2 rows selected (0.549 seconds)
-- select name,address.a,address.c,address.e from kafka_test_table;
+-------+-----+-----+-----+
| name | a | c | e |
+-------+-----+-----+-----+
| Raj | b | d | f |
| Raj1 | bb | dd | ff |
+-------+-----+-----+-----+
```
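#### Produce more records from Go (optional):
The console producer above is the quickest way to ingest, but records can also be pushed from a Go program. A minimal sketch using the github.com/Shopify/sarama client (the library choice and the extra record are illustrative assumptions; the hdp31a:6667 broker address is reused from the table definition above):
```
package main

import (
	"log"

	"github.com/Shopify/sarama"
)

func main() {
	// SyncProducer requires acknowledged messages to be returned on the Successes channel.
	config := sarama.NewConfig()
	config.Producer.Return.Successes = true

	producer, err := sarama.NewSyncProducer([]string{"hdp31a:6667"}, config)
	if err != nil {
		log.Fatalf("unable to create producer: %v", err)
	}
	defer producer.Close()

	// One more JSON record in the same shape as sample-data.json.
	record := `{"name": "Raj2","address": {"a": "bbb","c": "ddd","e": "fff"}}`
	partition, offset, err := producer.SendMessage(&sarama.ProducerMessage{
		Topic: "dummytopic1",
		Value: sarama.StringEncoder(record),
	})
	if err != nil {
		log.Fatalf("unable to send message: %v", err)
	}
	log.Printf("record written to partition %d at offset %d", partition, offset)
}
```
Once the record is committed to the topic, the next `select * from kafka_test_table` returns it as an additional row.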

Monday, July 2, 2018

Quickstart Druid Kafka Indexing Service

kafka indexing service

The overlord and middle managers should have the following extensions loaded: druid.extensions.loadList = ["druid-datasketches", "druid-hdfs-storage", "druid-kafka-indexing-service", "mysql-metadata-storage"]

supervisor spec

cat supervisor-spec.json
{
  "type": "kafka",
  "dataSchema": {
    "dataSource": "metrics-kafka",
    "parser": {
      "type": "string",
      "parseSpec": {
        "format": "json",
        "timestampSpec": {
          "column": "timestamp",
          "format": "auto"
        },
        "dimensionsSpec": {
          "dimensions": [],
          "dimensionExclusions": [
            "timestamp",
            "value"
          ]
        }
      }
    },
    "metricsSpec": [
      {
        "name": "count",
        "type": "count"
      },
      {
        "name": "value_sum",
        "fieldName": "value",
        "type": "doubleSum"
      },
      {
        "name": "value_min",
        "fieldName": "value",
        "type": "doubleMin"
      },
      {
        "name": "value_max",
        "fieldName": "value",
        "type": "doubleMax"
      }
    ],
    "granularitySpec": {
      "type": "uniform",
      "segmentGranularity": "HOUR",
      "queryGranularity": "NONE"
    }
  },
  "tuningConfig": {
    "type": "kafka",
    "maxRowsPerSegment": 5000000
  },
  "ioConfig": {
    "topic": "metrics",
    "consumerProperties": {
      "bootstrap.servers": "host2-node2:6667"
    },
    "taskCount": 1,
    "replicas": 1,
    "taskDuration": "PT1H"
  }
}

Push the supervisor spec to the overlord

curl -X POST -H 'Content-Type: application/json' -d @supervisor-spec.json http://host2-node2:8090/druid/indexer/v1/supervisor
{"id":"metrics-kafka"}

Get Supervisor IDs

[root@host2-node2 ~]# curl    http://host2-node2:8090/druid/indexer/v1/supervisor
["metrics-kafka"]

Create the Kafka topic

/usr/hdp/current/kafka-broker/bin/kafka-topics.sh --create --zookeeper `hostname`:2181 --replication-factor 1 --partitions 1 --topic metrics

Start producing messages to Kafka

python generate-example-metrics -c 1  | /usr/hdp/current/kafka-broker/bin/kafka-console-producer.sh --broker-list `hostname -f`:6667 --topic metrics

Now you can query this real-time data; to verify the datasource, check the overlord and coordinator UIs.
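
Besides the web consoles, the datasource can be queried over HTTP by posting a Druid native query to the broker. A minimal sketch in Go (the broker port 8082, the query interval, and the granularity are assumptions; adjust them to match your cluster and your data):

package main

import (
	"bytes"
	"fmt"
	"io/ioutil"
	"log"
	"net/http"
)

func main() {
	// 8082 is the default Druid broker port; adjust it for your cluster.
	url := "http://host2-node2:8082/druid/v2/"

	// Native timeseries query against the datasource created by the supervisor spec above.
	query := []byte(`{
		"queryType": "timeseries",
		"dataSource": "metrics-kafka",
		"granularity": "minute",
		"intervals": ["2018-07-02/2018-07-03"],
		"aggregations": [
			{"type": "longSum", "name": "count", "fieldName": "count"},
			{"type": "doubleSum", "name": "value_sum", "fieldName": "value_sum"}
		]
	}`)

	resp, err := http.Post(url, "application/json", bytes.NewReader(query))
	if err != nil {
		log.Fatalf("query failed: %v", err)
	}
	defer resp.Body.Close()

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		log.Fatalf("unable to read response: %v", err)
	}
	fmt.Println(string(body))
}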