Skip to content
Wade Salazar edited this page Jun 28, 2017 · 13 revisions

Welcome to the wellbook wiki!


Install Deps

yum install epel-release

yum groupinstall -y 'development tools'

sudo yum install -y python-devel libxslt-devel blas-devel lapack-devel gcc-gfortran


Python needs to be 2.7

wget https://www.python.org/ftp/python/2.7.12/Python-2.7.12.tgz

yum install python-pip

pip install pyquery numpy scipy scikit-learn

pip install virtualenv


Build Portable Virtual Env (this is packaged & distributed with MR jobs)

virtualenv ~/wellbook/pyenv

sudo cp las.py /usr/lib64/python2.7/site-packages

sudo cp recordhelper.py /usr/lib64/python2.7/site-packages


Build Hive Custom InputFormat

git clone https://github.com/randerzander/SequenceFileKeyValueInputFormat

cd SequenceFileKeyValueInputFormat

mvn package


Install Mahout

yum install mahout

sudo ln -sfn /usr/hdp/2.4.2.0-258/mahout /usr/hdp/current/mahout-client

mahout seqdirectory -i wellbook/production_raw -o wellbook/production_seq -prefix __key -ow


Install Maven

cd /tmp

wget http://www-eu.apache.org/dist/maven/maven-3/3.3.9/binaries/apache-maven-3.3.9-bin.tar.gz

tar -xf apache-maven-3.3.9-bin.tar.gz

mv apache-maven-3.3.9 /usr/share/maven


TEST

Check python scripts

cat ~/las_raw/17810-CBL.las | python2.7 las_readings.py filename,file_no,log_name,step_type,step,mnemonic,uom,reading las

Check Hive & Sequence Files

!connect jdbc:hive2://server-ip:10000/default

drop table if exists stage; create external table stage(filename string, text string) stored as inputformat 'com.github.randerzander.SequenceFileKeyValueInputFormat' outputformat 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat' location '/user/hdfs/wellbook/las_seq';

select transform(filename, text) using 'python las_metadata.py' as error,filename,file_no,log_name,block,mnemonic,uom,description from stage LIMIT 10

select transform(filename, text) using 'python las_readings.py' as filename,file_no,log_name,step_type,step,mnemonic,uom,reading from stage LIMIT 1

from ( select transform(filename, text) using 'python las_readings.py' as error,filename,file_no,log_name,step_type,step,mnemonic,uom,reading from stage LIMIT 100) source select error,filename,file_no,log_name,step_type,step,mnemonic,uom,reading where error = ''

hive -f job.hql -hiveconf SCRIPT=las_readings.py -hiveconf COLUMNS=filename,file_no,log_name,step_type,step,mnemonic,uom,reading -hiveconf SOURCE=/user/dev/wellbook/las_seq -hiveconf TARGET=log_readings