1 Installing the HDFS Dependency Package with pip
The hdfs dependency package can be downloaded directly from the official index, but the official site is slow. You can set up your own PyPI mirror, or use a local pip source; see the following posts for the configuration:
Changing the PyPI source for Python
https://www.cndba.cn/dave/article/2261
Setting up a local pip source with Python
https://www.cndba.cn/dave/article/2263
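If you would rather not change the global configuration, pip also accepts a mirror index for a single run via the -i option; for example, with the USTC mirror that appears in the log below:

C:\Users\dave>pip install pyhdfs -i https://pypi.mirrors.ustc.edu.cn/simple/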
Here we use Python 2.7 to operate HDFS. Install the dependency package with pip:
C:\Users\dave>pip install pyhdfs
DEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7.
Looking in indexes: https://pypi.mirrors.ustc.edu.cn/simple/
Collecting pyhdfs
Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/16/9f/a358e199f2d99229ff470e0063e26111fd0121ef034b853d9b9c0fd26b72/PyHDFS-0.2.1.tar.gz
Requirement already satisfied: requests in c:\python27\lib\site-packages (from pyhdfs) (2.14.2)
Collecting simplejson (from pyhdfs)
Downloading https://mirrors.tuna.tsinghua.edu.cn/pypi/web/packages/04/cc/ac40aec8710b4ae32e24b8f14e5868a39b54194622ae9b85f49091f8a793/simplejson-3.16.0-cp27-cp27m-win_amd64.whl (68kB)
|████████████████████████████████| 71kB 1.5MB/s
Installing collected packages: simplejson, pyhdfs
Running setup.py install for pyhdfs ... done
Successfully installed pyhdfs-0.2.1 simplejson-3.16.0
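To verify the installation, pip show prints the package metadata (name, version, and install location):

C:\Users\dave>pip show pyhdfs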
For the operations that HDFS supports, refer to:
https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/WebHDFS.html
GitHub address:
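pyhdfs is a thin client over the WebHDFS REST API referenced above, so each method used below corresponds to a WebHDFS operation. As a sketch (the hostname is a placeholder; 50070 is the default namenode HTTP port in Hadoop 2.x), listing a directory over raw WebHDFS looks like this:

curl -i "http://hadoop-master1:50070/webhdfs/v1/user?op=LISTSTATUS&user.name=hdfs"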
2 Operating HDFS with Python
Setting up the HDFS environment itself is not covered here; see my blog post:
Cloudera Manager and CDH 5.16 Offline Installation Guide (Detailed Edition)
https://www.cndba.cn/download/dave/9
Here we mainly follow the pyhdfs manual to perform operations on HDFS.
The official pyhdfs documentation is at:
https://pyhdfs.readthedocs.io/en/latest/pyhdfs.html
Our operations on HDFS mainly go through the pyhdfs.HdfsClient class. The test code is as follows:
# -*- coding:utf-8 -*-
# @Time    : 2019/5/7 12:26
# @Author  : David Dai
# @FileName: hdfs.py
# @Software: PyCharm
# @Blog    : https://www.cndba.cn/dave

import pyhdfs


class HDFS:
    def __init__(self, host='127.0.0.1', user_name='hdfs'):
        self.host = host
        self.user_name = user_name

    # Build a WebHDFS client connection
    def get_con(self):
        try:
            hdfs = pyhdfs.HdfsClient(hosts=self.host, user_name=self.user_name)
            return hdfs
        except pyhdfs.HdfsException as e:
            print "Error:%s" % e

    # List all files under the given directory
    def listdir(self, oper):
        try:
            client = self.get_con()
            dirs = client.listdir(oper)
            for row in dirs:
                print row
        except pyhdfs.HdfsException as e:
            print "Error:%s" % e

    # Return the user's home directory
    def get_home_directory(self):
        try:
            client = self.get_con()
            print client.get_home_directory()
        except pyhdfs.HdfsException as e:
            print "Error:%s" % e

    # Return the currently active namenode
    def get_active_namenode(self):
        try:
            client = self.get_con()
            print client.get_active_namenode()
        except pyhdfs.HdfsException as e:
            print "Error:%s" % e

    # Create a new directory
    def mkdirs(self, oper):
        try:
            client = self.get_con()
            print client.mkdirs(oper)
        except pyhdfs.HdfsException as e:
            print "Error:%s" % e

    # Copy a file from the cluster to the local filesystem
    def copy_to_local(self, src, localdest):
        try:
            client = self.get_con()
            print client.copy_to_local(src, localdest)
        except pyhdfs.HdfsException as e:
            print "Error:%s" % e

    # Upload a local file to the cluster
    def copy_from_local(self, localsrc, dest):
        try:
            client = self.get_con()
            print client.copy_from_local(localsrc, dest)
        except pyhdfs.HdfsException as e:
            print "Error:%s" % e

    # Print the contents of a file
    def read_files(self, oper):
        try:
            client = self.get_con()
            response = client.open(oper)
            print response.read()
        except pyhdfs.HdfsException as e:
            print "Error:%s" % e

    # Append content to an existing file
    def append_files(self, file, content):
        try:
            client = self.get_con()
            print client.append(file, content)
        except pyhdfs.HdfsException as e:
            print "Error:%s" % e

    # Check whether a file exists
    def check_files(self, oper):
        try:
            client = self.get_con()
            print client.exists(oper)
        except pyhdfs.HdfsException as e:
            print "Error:%s" % e

    # Get the checksum of a file
    def get_file_checksum(self, oper):
        try:
            client = self.get_con()
            print client.get_file_checksum(oper)
        except pyhdfs.HdfsException as e:
            print "Error:%s" % e

    # Get a space/quota summary of the given path
    def get_content_summary(self, oper):
        try:
            client = self.get_con()
            print client.get_content_summary(oper)
        except pyhdfs.HdfsException as e:
            print "Error:%s" % e

    # Get the status of the given path
    def list_status(self, oper):
        try:
            client = self.get_con()
            print client.list_status(oper)
        except pyhdfs.HdfsException as e:
            print "Error:%s" % e

    # Delete a file
    def delete_files(self, path):
        try:
            client = self.get_con()
            print client.delete(path)
        except pyhdfs.HdfsException as e:
            print "Error:%s" % e


if __name__ == '__main__':
    db = HDFS('hadoop-master1', 'hdfs')
    # db.listdir('/user')
    # db.get_home_directory()
    # db.get_active_namenode()
    # db.mkdirs('/dave')
    # db.copy_from_local("D:/dave.txt", "/dave/dave.txt")
    # db.listdir('/dave')
    # db.read_files('/dave/dave.txt')
    # db.check_files('/dave/dave.txt')
    # db.get_file_checksum('/dave/dave.txt')
    # db.get_content_summary('/')
    # db.list_status('/')
    # db.list_status('/dave/dave.txt')
    # db.copy_to_local("/dave/dave.txt", "D:/cndba.txt")
    # db.append_files('/dave/dave.txt', "88,安徽DBA俱乐部")
    # db.read_files('/dave/dave.txt')
    # db.copy_from_local("D:/cndba.txt", "/dave/cndba.txt")
    db.listdir('/dave')
    # db.delete_files('/dave/cndba.txt')
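One note on the hosts parameter: HdfsClient also accepts a comma-separated list of namenode host:port pairs, which is how pyhdfs handles an HA cluster; requests are retried against each host until the active namenode answers. A minimal sketch, with hypothetical hostnames:

import pyhdfs

# Hypothetical HA namenode pair; pyhdfs retries each host until it
# reaches the active namenode
client = pyhdfs.HdfsClient(hosts='hadoop-master1:50070,hadoop-master2:50070',
                           user_name='hdfs')
print client.get_active_namenode()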