Skip to content

Processing Errors are now Stored in the ElasticSearch Index #48

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 24 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
3ecf6bd
First shot at POM reading, extracts PublicationDate, Description and …
Jul 9, 2020
e96b08e
Made style of pom file processing more inline with rest of application.
Jul 9, 2020
b3a2dfa
Add test for pom file reader
Jul 9, 2020
cc9ffda
Proper error handling in POM file reading actor
Jul 9, 2020
ece0b24
Added processing of issue management system to POM reader.
Jul 9, 2020
d254798
Revert unnecessary changes (whitespaces)
Jul 13, 2020
f063cac
Moved PomFileReadActor to processing package, changed behavior on fai…
Jul 15, 2020
2740992
Added storage trait for POM file properties. Now also extracting lice…
Aug 2, 2020
7c84ead
Add dependency extraction for POM files. Some basic variable resolvin…
Sep 17, 2020
82748c9
Recursively resolve POM variables in parents if possible. On-Demand a…
Sep 17, 2020
7440e8a
Fix code smell
Sep 17, 2020
24aed36
Remove code duplication in test
Sep 17, 2020
577f543
PomReadActor now also resolves dependencies where version is not spec…
Sep 18, 2020
5e535d2
Optimized dependency resolving. Versions are now resolved throughout …
Sep 18, 2020
8853ed7
Optimization: Parent hierarchy is now lazy, ie only downloaded if at …
Sep 19, 2020
0992b64
Now extracting scopes for dependencies from POM files
Sep 21, 2020
221ff7d
Now extracting parent and packaging. Fixed some storage issues
Sep 21, 2020
1b0e71f
Fixed a bug in actor communication. Code style improvements
Sep 21, 2020
9ae5e3c
Adapt tests to latest actor api change
Oct 8, 2020
06e1b74
Some restructuring to prepare persistent error storage
Oct 8, 2020
6d3eafa
First version of full error redirecting, not yet stored anywhere
Oct 13, 2020
b160740
First working version that stores errors in elastic using a new type …
Oct 19, 2020
2078bd7
Adapted tests to last change in actor APIs
Oct 19, 2020
73560af
Fixed bug in elastic data model regarding error storage
Oct 19, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ import de.upb.cs.swt.delphi.crawler.{AppLogging, Configuration}
import de.upb.cs.swt.delphi.crawler.control.Phase
import de.upb.cs.swt.delphi.crawler.control.Phase.Phase
import de.upb.cs.swt.delphi.crawler.tools.ActorStreamIntegrationSignals.{Ack, StreamCompleted, StreamFailure, StreamInitialized}
import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenDownloadActor}
import de.upb.cs.swt.delphi.crawler.processing.{HermesActor, HermesResults}
import de.upb.cs.swt.delphi.crawler.preprocessing.{MavenArtifact, MavenArtifactMetadata, MavenDownloadActor, MavenDownloadActorResponse}
import de.upb.cs.swt.delphi.crawler.processing.{HermesActor, HermesActorResponse, HermesResults, PomFileReadActor, PomFileReadActorResponse, ProcessingFailureStorageActor}
import de.upb.cs.swt.delphi.crawler.storage.ArtifactExistsQuery
import de.upb.cs.swt.delphi.crawler.tools.NotYetImplementedException

Expand Down Expand Up @@ -57,6 +57,8 @@ class MavenDiscoveryProcess(configuration: Configuration, elasticPool: ActorRef)
private val seen = mutable.HashSet[MavenIdentifier]()

val downloaderPool = system.actorOf(SmallestMailboxPool(8).props(MavenDownloadActor.props))
val pomReaderPool = system.actorOf(SmallestMailboxPool(8).props(PomFileReadActor.props(configuration)))
val errorHandlerPool = system.actorOf(SmallestMailboxPool(8).props(ProcessingFailureStorageActor.props(elasticPool)))
val hermesPool = system.actorOf(SmallestMailboxPool(configuration.hermesActorPoolSize).props(HermesActor.props()))

override def phase: Phase = Phase.Discovery
Expand Down Expand Up @@ -86,15 +88,21 @@ class MavenDiscoveryProcess(configuration: Configuration, elasticPool: ActorRef)
val preprocessing =
filteredSource
.alsoTo(createSinkFromActorRef[MavenIdentifier](elasticPool))
.mapAsync(8)(identifier => (downloaderPool ? identifier).mapTo[Try[MavenArtifact]])
.filter(artifact => artifact.isSuccess)
.map(artifact => artifact.get)
.mapAsync(8)(identifier => (downloaderPool ? identifier).mapTo[MavenDownloadActorResponse])
.alsoTo(createSinkFromActorRef[MavenDownloadActorResponse](errorHandlerPool))
.filter(!_.pomDownloadFailed)

val finalizer =
preprocessing
.mapAsync(configuration.hermesActorPoolSize)(artifact => (hermesPool ? artifact).mapTo[Try[HermesResults]])
.filter(results => results.isSuccess)
.map(results => results.get)
.mapAsync(8)(downloadResponse => (pomReaderPool ? downloadResponse).mapTo[PomFileReadActorResponse])
.alsoTo(createSinkFromActorRef[PomFileReadActorResponse](errorHandlerPool))
.alsoTo(createSinkFromActorRef[PomFileReadActorResponse](elasticPool))
.filter(response => !response.jarDownloadFailed)
.map(_.artifact)
.mapAsync(configuration.hermesActorPoolSize)(artifact => (hermesPool ? artifact).mapTo[HermesActorResponse])
.alsoTo(createSinkFromActorRef[HermesActorResponse](errorHandlerPool))
.filter(_.result.isSuccess)
.map(_.result.get)
.alsoTo(createSinkFromActorRef[HermesResults](elasticPool))
.to(Sink.ignore)
.run()
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
// Copyright (C) 2018 The Delphi Team.
// See the LICENCE file distributed with this work for additional
// information regarding copyright ownership.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package de.upb.cs.swt.delphi.crawler.discovery.maven

import org.joda.time.DateTime

/**
  * A single failure recorded while processing a Maven artifact.
  *
  * @param identifier the Maven identifier the failure is reported for
  * @param occurredAt timestamp of the failure; the factory methods in the companion object use `DateTime.now()`
  * @param errorType  the kind of failure, one of the [[MavenErrorType]] values
  * @param message    textual description of the failure
  */
case class MavenProcessingError(identifier: MavenIdentifier,
occurredAt: DateTime,
errorType: MavenErrorType.Value,
message: String)

/**
  * Kinds of failures that are distinguished for a Maven artifact.
  *
  * Values are declared one per line; declaration order fixes the ids (0 to 3)
  * and each value's name is the name of its `val`.
  */
object MavenErrorType extends Enumeration {
  type MavenErrorType = Value

  val PomDownloadFailed: Value = Value
  val JarDownloadFailed: Value = Value
  val PomParsingFailed: Value = Value
  val HermesProcessingFailed: Value = Value
}


/**
  * Factory methods for [[MavenProcessingError]], one per [[MavenErrorType]] value.
  * Every factory stamps the created error with the current time.
  */
object MavenProcessingError {

  // Shared construction path: the error type is fixed first, the per-call data follows.
  private def errorOfType(kind: MavenErrorType.Value)
                         (id: MavenIdentifier, text: String): MavenProcessingError =
    MavenProcessingError(identifier = id, occurredAt = DateTime.now(), errorType = kind, message = text)

  /** Creates an error of type `PomDownloadFailed` for the given identifier. */
  def createPomDownloadError(identifier: MavenIdentifier, message: String): MavenProcessingError =
    errorOfType(MavenErrorType.PomDownloadFailed)(identifier, message)

  /** Creates an error of type `JarDownloadFailed` for the given identifier. */
  def createJarDownloadError(identifier: MavenIdentifier, message: String): MavenProcessingError =
    errorOfType(MavenErrorType.JarDownloadFailed)(identifier, message)

  /** Creates an error of type `PomParsingFailed` for the given identifier. */
  def createPomParsingError(identifier: MavenIdentifier, message: String): MavenProcessingError =
    errorOfType(MavenErrorType.PomParsingFailed)(identifier, message)

  /** Creates an error of type `HermesProcessingFailed` for the given identifier. */
  def createHermesProcessingError(identifier: MavenIdentifier, message: String): MavenProcessingError =
    errorOfType(MavenErrorType.HermesProcessingFailed)(identifier, message)
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,5 +17,26 @@
package de.upb.cs.swt.delphi.crawler.preprocessing

import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier
import org.joda.time.DateTime

case class MavenArtifact(identifier : MavenIdentifier, jarFile: JarFile, pomFile: PomFile)
/**
  * A Maven artifact together with the files downloaded for it.
  *
  * @param identifier      the identifier the artifact was downloaded for
  * @param jarFile         the downloaded jar; `None` when the jar download failed
  * @param pomFile         the downloaded POM file
  * @param publicationDate date parsed from the `Last-Modified` header of the POM download, if present and parseable
  * @param metadata        `None` as created by the download actor; `MavenArtifact.withMetadata` yields a copy with it set
  */
case class MavenArtifact(identifier : MavenIdentifier, jarFile: Option[JarFile], pomFile: PomFile,
publicationDate: Option[DateTime], metadata: Option[MavenArtifactMetadata])

/**
  * Descriptive properties of a Maven artifact.
  *
  * NOTE(review): presumably these are the values read from the artifact's POM file —
  * confirm against the POM reading actor, which is not visible here.
  *
  * @param name            artifact name
  * @param description     artifact description
  * @param developers      developer entries as plain strings
  * @param licenses        licenses of the artifact
  * @param issueManagement issue management data, if any
  * @param dependencies    dependencies of the artifact, each with an optional scope
  * @param parent          identifier of the parent, if any
  * @param packaging       packaging of the artifact
  */
case class MavenArtifactMetadata(name: String,
description: String,
developers: List[String],
licenses: List[ArtifactLicense],
issueManagement: Option[IssueManagementData],
dependencies: Set[ArtifactDependency],
parent:Option[MavenIdentifier],
packaging: String)

/** Issue management entry: the name of the system and its URL. */
case class IssueManagementData(system: String, url: String)
/** A license, given by its name and URL. */
case class ArtifactLicense(name: String, url:String)
/** A dependency on the artifact named by `identifier`, with an optional scope. */
case class ArtifactDependency(identifier: MavenIdentifier, scope: Option[String])

object MavenArtifact {

  /**
    * Returns a copy of `artifact` whose metadata is `Some(metadata)`;
    * identifier, jar file, POM file and publication date are carried over unchanged.
    */
  def withMetadata(artifact: MavenArtifact, metadata: MavenArtifactMetadata): MavenArtifact =
    artifact.copy(metadata = Some(metadata))
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,48 +16,73 @@

package de.upb.cs.swt.delphi.crawler.preprocessing

import java.util.Locale

import akka.actor.{Actor, ActorLogging, ActorSystem, Props}
import de.upb.cs.swt.delphi.crawler.discovery.maven.MavenIdentifier
import de.upb.cs.swt.delphi.crawler.tools.HttpDownloader
import org.joda.time.format.DateTimeFormat

import scala.util.{Failure, Success}
import scala.util.{Failure, Success, Try}

class MavenDownloadActor extends Actor with ActorLogging {

override def receive: Receive = {
case m : MavenIdentifier => {
case m : MavenIdentifier =>
implicit val system : ActorSystem = context.system

val downloader = new HttpDownloader

val jarStream = downloader.downloadFromUri(m.toJarLocation.toString())
val pomStream = downloader.downloadFromUri(m.toPomLocation.toString())

jarStream match {
case Success(jar) => {
pomStream match {
case Success(pom) => {
log.info(s"Downloaded $m")
sender() ! Success(MavenArtifact(m, JarFile(jar, m.toJarLocation.toURL), PomFile(pom)))
}
case Failure(e) => {
// TODO: push error to actor
log.warning(s"Failed pom download for $m")
sender() ! Failure(e)
}
val pomResponse = downloader.downloadFromUriWithHeaders(m.toPomLocation.toString)

pomResponse match {
case Success((pomStream, pomHeaders)) =>
log.info(s"Downloaded $m")

// Extract and parse publication date from header
val datePattern = DateTimeFormat.forPattern("E, dd MMM yyyy HH:mm:ss zzz").withLocale(Locale.ENGLISH)
val pomPublicationDate = pomHeaders.find( _.lowercaseName().equals("last-modified") )
.map( header => Try(datePattern.parseDateTime(header.value())) ) match {
case Some(Success(date)) => Some(date)
case Some(Failure(x)) =>
log.warning(s"Failed to extract publication date for $m: ${x.getMessage}")
None
case _ => None
}
}
case Failure(e) => {
// TODO: push error to actor
log.warning(s"Failed jar download for $m")
sender() ! Failure(e)
}
}

downloader.downloadFromUri(m.toJarLocation.toString) match {
case Success(jar) =>
sender() ! MavenDownloadActorResponse(
m,
Some(MavenArtifact(m, Some(JarFile(jar, m.toJarLocation.toURL)), PomFile(pomStream), pomPublicationDate, None)),
dateParsingFailed = pomPublicationDate.isEmpty)
case Failure(ex) =>
log.warning(s"Failed to download jar file for $m")
sender() ! MavenDownloadActorResponse(
m,
Some(MavenArtifact(m, None, PomFile(pomStream), pomPublicationDate, None)),
jarDownloadFailed = true,
dateParsingFailed = pomPublicationDate.isEmpty,
errorMessage = ex.getMessage
)
}

case Failure(ex) =>
log.error(s"Failed to download pom file for $m with message: ${ex.getMessage}")
sender() ! MavenDownloadActorResponse(m, None, pomDownloadFailed = true, errorMessage = ex.getMessage)
}

}
}
}

/**
  * Reply sent by [[MavenDownloadActor]] for a received [[MavenIdentifier]].
  *
  * @param identifier        the identifier the download was requested for
  * @param artifact          the downloaded artifact; `None` when the POM download failed
  * @param pomDownloadFailed `true` when the POM file could not be downloaded
  * @param jarDownloadFailed `true` when the POM was downloaded but the jar was not; the artifact then carries no jar file
  * @param dateParsingFailed `true` when no publication date was obtained from the POM response headers
  *                          (header missing or not parseable)
  * @param errorMessage      message of the exception behind a failed download; empty by default
  */
case class MavenDownloadActorResponse(identifier: MavenIdentifier,
artifact: Option[MavenArtifact],
pomDownloadFailed: Boolean = false,
jarDownloadFailed: Boolean = false,
dateParsingFailed: Boolean = false,
errorMessage: String = "")

object MavenDownloadActor {
def props = Props(new MavenDownloadActor)
def props: Props = Props(new MavenDownloadActor)
}

Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ class HermesActor() extends Actor with ActorLogging with OPALFunctionality with
computeHermesResult(m, reifyProject(m))
}

sender() ! hermesResult
sender() ! HermesActorResponse(m.identifier, hermesResult)
}
}
}
Expand All @@ -46,4 +46,6 @@ object HermesActor {

}

case class HermesResults(identifier: MavenIdentifier, featureMap: Map[String, Int])
case class HermesResults(identifier: MavenIdentifier, featureMap: Map[String, Int])

/**
  * Reply sent by the Hermes actor: the identifier of the processed artifact
  * together with the `Try` holding the outcome of the Hermes computation.
  */
case class HermesActorResponse(identifier: MavenIdentifier, result: Try[HermesResults])
Original file line number Diff line number Diff line change
Expand Up @@ -28,8 +28,8 @@ trait OPALFunctionality {

def reifyProject(m: MavenArtifact): Project[URL] = {
val project = new ClassStreamReader {}.createProject(m.identifier.toJarLocation.toURL,
new JarInputStream(m.jarFile.is))
Try(m.jarFile.is.close())
new JarInputStream(m.jarFile.get.is))
Try(m.jarFile.get.is.close())
project
}
}
Loading