更新时间:2022-10-14 20:29:51
我通过将字节流写入本地文件、再把该文件作为 h5 文件打开的方式,成功读取了 tar 包中的 HDF5 文件,并据此提取特征。以下是我的代码:
// Reads every .h5 file stored inside the listed tar.gz archives and extracts
// each song's feature list from it. HDF5 files cannot be opened from an
// in-memory byte array here, so each one is spilled to a temporary local file
// first and opened from there.
var tarFiles: Array[String] = Array()
val tar_path = path + "millionsongsubset.tar.gz"
// TODO: add all your tar.gz files in the main folder path to the tarFiles array.
// Add here as many tar.gz files as wanted containing the hdf5 files for the songs.
tarFiles = tarFiles :+ tar_path
// tarFiles = tarFiles :+ (path + "A.tar.gz")
// tarFiles = tarFiles :+ (path + "B.tar.gz")
// tarFiles = tarFiles :+ (path + "C.tar.gz")

// For each archive in tarFiles, collect the raw bytes of every .h5 entry, then
// turn each byte array into that song's list of features.
var allHDF5 = sc.parallelize(tarFiles).flatMap(path => {
  val tar = new TarArchiveInputStream(new GzipCompressorInputStream(new FileInputStream(path)))
  try {
    var entry: TarArchiveEntry = tar.getNextEntry().asInstanceOf[TarArchiveEntry]
    var res: List[Array[Byte]] = List()
    var i = 0
    while (entry != null) {
      if (!entry.isDirectory() && entry.getName.contains(".h5")) {
        val byteFile = Array.ofDim[Byte](entry.getSize.toInt)
        // A single read() may return fewer bytes than requested, so keep
        // reading until the buffer is full or the entry is exhausted.
        var offset = 0
        var eof = false
        while (!eof && offset < byteFile.length) {
          val n = tar.read(byteFile, offset, byteFile.length - offset)
          if (n <= 0) eof = true else offset += n
        }
        res = byteFile :: res
        if (i % 100 == 0) {
          println("Read " + i + " files")
        }
        i = i + 1
      }
      entry = tar.getNextEntry().asInstanceOf[TarArchiveEntry]
    }
    // All files are turned into byte arrays
    res
  } finally {
    // Release the archive handle even when reading fails part-way through.
    tar.close()
  }
}).map(bytes => {
  // Use a real temporary file: Array.toString is only an identity hash, which
  // is not guaranteed to be unique and would leave files in the working directory.
  val tmp = File.createTempFile("song-", ".h5")
  try {
    FileUtils.writeByteArrayToFile(tmp, bytes)
    val reader = HDF5Factory.openForReading(tmp.getAbsolutePath)
    try getFeatures(reader) finally reader.close()
  } finally {
    tmp.delete()
  }
})
println("Extracted songs from tar.gz, showing 5 examples")
allHDF5.take(5).foreach(x => {
  x.foreach(y => print(y + " "))
  println()
})
几点评论:
After referring to this post, I can read multiple *.txt files residing in a *.tar.gz file. Now, however, I need to read HDF5 files from a *.tar.gz file. A sample file, generated from the Million Song Dataset, can be downloaded here. Could anyone tell me how I should change the following code in order to read the HDF5 files into an RDD? Thanks!
package a.b.c
import org.apache.spark._
import org.apache.spark.sql.{SQLContext, DataFrame}
import org.apache.spark.ml.tuning.CrossValidatorModel
import org.apache.spark.ml.regression.LinearRegressionModel
import org.apache.spark.ml.{Pipeline, PipelineModel}
import org.apache.spark.ml.regression.LinearRegression
import org.apache.spark.input.PortableDataStream
import org.apache.commons.compress.archivers.tar.TarArchiveInputStream
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream
import scala.util.Try
import java.nio.charset._
/**
 * Reads the text files packed inside a gzip-compressed tar archive into a
 * Spark DataFrame with one row per line of text.
 */
object Main {

  /**
   * Builds a local Spark context, loads the archive as a binary file, extracts
   * and decodes every regular file inside it, and splits the text into lines.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("lab1").setMaster("local")
    val sc = new SparkContext(conf)
    val sqlContext = new SQLContext(sc)
    import sqlContext.implicits._
    import sqlContext._
    val inputpath = "path/to/millionsong.tar.gz"
    val rawDF = sc.binaryFiles(inputpath, 2)
      // Archives that fail to extract are dropped (Try -> Option).
      .flatMapValues(x => extractFiles(x).toOption)
      .mapValues(_.map(decode()))
      .map(_._2)
      .flatMap(x => x)
      .flatMap { x => x.split("\n") }
      .toDF()
  }

  /**
   * Extracts the contents of every non-directory entry of a tar.gz stream.
   *
   * @param ps the archive, as handed out by `SparkContext.binaryFiles`
   * @param n  size in bytes of the buffer used for each read call
   * @return one byte array per regular file in the archive, or a `Failure`
   *         if the stream could not be opened or read
   */
  def extractFiles(ps: PortableDataStream, n: Int = 1024): Try[Array[Array[Byte]]] = Try {
    val raw = ps.open()
    try {
      val tar = new TarArchiveInputStream(new GzipCompressorInputStream(raw))
      try {
        Stream.continually(Option(tar.getNextTarEntry))
          // Read until the next entry is null
          .takeWhile(_.isDefined)
          // Unwrap the Options
          .flatMap(x => x)
          // Drop directories
          .filter(!_.isDirectory)
          .map(e => {
            Stream.continually {
              // Read up to n bytes of the current entry
              val buffer = new Array[Byte](n)
              val i = tar.read(buffer, 0, n)
              (i, buffer.take(i))
            }
              // Take as long as we've read something
              .takeWhile(_._1 > 0)
              .map(_._2)
              .flatten
              .toArray
          })
          // Forces the lazy stream, so all reads happen before the close below.
          .toArray
      } finally tar.close()
    } finally {
      // Also covers the case where wrapping the raw stream itself failed.
      raw.close()
    }
  }

  /**
   * Decodes a byte array into a string.
   *
   * @param charset character set used for decoding (UTF-8 by default)
   * @param bytes   raw bytes of a text file
   * @return the decoded text
   */
  def decode(charset: Charset = StandardCharsets.UTF_8)(bytes: Array[Byte]): String =
    new String(bytes, charset)
}
I managed to read the HDF5 files within the tarball by writing the byte stream into a local file and then opening this file as h5, extracting the features using this. Here is my code:
// Reads every .h5 file stored inside the listed tar.gz archives and extracts
// each song's feature list from it. HDF5 files cannot be opened from an
// in-memory byte array here, so each one is spilled to a temporary local file
// first and opened from there.
var tarFiles: Array[String] = Array()
val tar_path = path + "millionsongsubset.tar.gz"
//TODO: add all your tar.gz files in main folder path to tarFiles array
//should add here as many tar.gz files as wanted containing the
//hdf5 files for the songs
tarFiles = tarFiles :+ tar_path
//tarFiles = tarFiles :+ (path+"A.tar.gz")
//tarFiles = tarFiles :+ (path+"B.tar.gz")
//tarFiles = tarFiles :+ (path+"C.tar.gz")

//This reads all tar.gz files in tarFiles list, and for each .h5
//file within, it extracts each song's list of features.
//Thus, it gets a list of features for all songs in the files.
var allHDF5 = sc.parallelize(tarFiles).flatMap(path => {
  val tar = new TarArchiveInputStream(new GzipCompressorInputStream(new FileInputStream(path)))
  try {
    var entry: TarArchiveEntry = tar.getNextEntry().asInstanceOf[TarArchiveEntry]
    var res: List[Array[Byte]] = List()
    var i = 0
    while (entry != null) {
      if (!entry.isDirectory() && entry.getName.contains(".h5")) {
        val byteFile = Array.ofDim[Byte](entry.getSize.toInt)
        // A single read() may return fewer bytes than requested, so keep
        // reading until the buffer is full or the entry is exhausted.
        var offset = 0
        var eof = false
        while (!eof && offset < byteFile.length) {
          val n = tar.read(byteFile, offset, byteFile.length - offset)
          if (n <= 0) eof = true else offset += n
        }
        res = byteFile :: res
        if (i % 100 == 0) {
          println("Read " + i + " files")
        }
        i = i + 1
      }
      entry = tar.getNextEntry().asInstanceOf[TarArchiveEntry]
    }
    //All files are turned into byte arrays
    res
  } finally {
    // Release the archive handle even when reading fails part-way through.
    tar.close()
  }
}).map(bytes => {
  // Use a real temporary file: Array.toString is only an identity hash, which
  // is not guaranteed to be unique and would leave files in the working directory.
  val tmp = File.createTempFile("song-", ".h5")
  try {
    FileUtils.writeByteArrayToFile(tmp, bytes)
    val reader = HDF5Factory.openForReading(tmp.getAbsolutePath)
    try getFeatures(reader) finally reader.close()
  } finally {
    tmp.delete()
  }
})
println("Extracted songs from tar.gz, showing 5 examples")
allHDF5.take(5).foreach(x => {
  x.foreach(y => print(y + " "))
  println()
})
Several remarks: