Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TIKA-3351 Don't allow dups in Parsed-By metadata #425

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 13 additions & 7 deletions tika-core/src/main/java/org/apache/tika/metadata/Metadata.java
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,7 @@ public class Metadata
* Constructs a new, empty metadata.
*/
public Metadata() {
metadata = new HashMap<String, String[]>();
metadata = new HashMap<>();
}

private static DateFormat createDateFormat(String format, TimeZone timezone) {
Expand Down Expand Up @@ -116,7 +116,7 @@ public String[] names() {
}

/**
* Get the value associated to a metadata name. If many values are assiociated
* Get the value associated to a metadata name. If many values are associated
* to the specified name, then the first one is returned.
*
* @param name of the metadata.
Expand Down Expand Up @@ -221,11 +221,17 @@ private String[] _getValues(final String name) {
return values;
}

private String[] appendedValues(String[] values, final String value) {
// private String[] appendedValues(String[] values, final String value) {
// return appendedValues(values, value, true);
// }

private String[] appendedValues(String[] values, final String value, boolean allowDups) {
String[] newValues = new String[values.length + 1];
System.arraycopy(values, 0, newValues, 0, values.length);
newValues[newValues.length - 1] = value;
return newValues;
return allowDups
? newValues
: Arrays.stream(newValues).distinct().toArray(String[]::new);
}

/**
Expand All @@ -240,7 +246,7 @@ public void add(final String name, final String value) {
if (values == null) {
set(name, value);
} else {
metadata.put(name, appendedValues(values, value));
metadata.put(name, appendedValues(values, value, true));
}
}

Expand Down Expand Up @@ -270,7 +276,7 @@ public void add(final Property property, final String value) {
set(property, value);
} else {
if (property.isMultiValuePermitted()) {
set(property, appendedValues(values, value));
set(property, appendedValues(values, value, property.isDupsPermitted()));
} else {
throw new PropertyTypeException(
property.getName() + " : " + property.getPropertyType());
Expand Down Expand Up @@ -548,7 +554,7 @@ public boolean equals(Object o) {
}

public String toString() {
StringBuffer buf = new StringBuffer();
StringBuilder buf = new StringBuilder();
String[] names = names();
for (String name : names) {
String[] values = _getValues(name);
Expand Down
26 changes: 21 additions & 5 deletions tika-core/src/main/java/org/apache/tika/metadata/Property.java
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
*/
public final class Property implements Comparable<Property> {

private static final Map<String, Property> properties = new HashMap<String, Property>();
private static final Map<String, Property> properties = new HashMap<>();
private final String name;
private final boolean internal;
private final PropertyType propertyType;
Expand All @@ -57,7 +57,7 @@ private Property(String name, boolean internal, PropertyType propertyType, Value
this.valueType = valueType;
if (choices != null) {
this.choices = Collections
.unmodifiableSet(new HashSet<String>(Arrays.asList(choices.clone())));
.unmodifiableSet(new HashSet<>(Arrays.asList(choices.clone())));
} else {
this.choices = null;
}
Expand Down Expand Up @@ -120,7 +120,7 @@ public static Property get(String key) {
}

public static SortedSet<Property> getProperties(String prefix) {
SortedSet<Property> set = new TreeSet<Property>();
SortedSet<Property> set = new TreeSet<>();
String p = prefix + ":";
synchronized (properties) {
for (String name : properties.keySet()) {
Expand Down Expand Up @@ -172,6 +172,10 @@ public static Property internalTextBag(String name) {
return new Property(name, true, PropertyType.BAG, ValueType.TEXT);
}

public static Property internalTextSet(String name) {
return new Property(name, true, PropertyType.SET, ValueType.TEXT);
}

public static Property internalURI(String name) {
return new Property(name, true, ValueType.URI);
}
Expand Down Expand Up @@ -216,6 +220,10 @@ public static Property externalTextBag(String name) {
return new Property(name, false, PropertyType.BAG, ValueType.TEXT);
}

public static Property externalTextSet(String name) {
return new Property(name, false, PropertyType.SET, ValueType.TEXT);
}

/**
* Constructs a new composite property from the given primary and array of secondary properties.
* <p>
Expand Down Expand Up @@ -266,8 +274,8 @@ public boolean isExternal() {
* Is the PropertyType one which accepts multiple values?
*/
public boolean isMultiValuePermitted() {
if (propertyType == PropertyType.BAG || propertyType == PropertyType.SEQ ||
propertyType == PropertyType.ALT) {
if (propertyType == PropertyType.BAG || propertyType == PropertyType.SET
|| propertyType == PropertyType.SEQ || propertyType == PropertyType.ALT) {
return true;
} else if (propertyType == PropertyType.COMPOSITE) {
// Base it on the primary property's behaviour
Expand All @@ -276,6 +284,10 @@ public boolean isMultiValuePermitted() {
return false;
}

public boolean isDupsPermitted() {
return propertyType != PropertyType.SET;
}

public PropertyType getPropertyType() {
return propertyType;
}
Expand Down Expand Up @@ -338,6 +350,10 @@ public enum PropertyType {
* An un-ordered array
*/
BAG,
/**
* An un-ordered array with unique values
*/
SET,
/**
* An ordered array
*/
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -87,7 +87,7 @@ public interface TikaCoreProperties {
*/
Property TIKA_META_EXCEPTION_EMBEDDED_STREAM =
Property.internalTextBag(TIKA_META_EXCEPTION_PREFIX + "embedded_stream_exception");
Property TIKA_PARSED_BY = Property.internalTextBag(TIKA_META_PREFIX + "Parsed-By");
Property TIKA_PARSED_BY = Property.internalTextSet(TIKA_META_PREFIX + "Parsed-By");
String RESOURCE_NAME_KEY = "resourceName";
String PROTECTED = "protected";
String EMBEDDED_RELATIONSHIP_ID = "embeddedRelationshipId";
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -196,11 +196,10 @@ public void testSupplemental() throws Exception {
assertEquals("Test1", metadata.get("TBoth"));

usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
assertEquals(4, usedParsers.length);
assertEquals(3, usedParsers.length); // DummyParser is only added once
assertEquals(ErrorParser.class.getName(), usedParsers[0]);
assertEquals(DummyParser.class.getName(), usedParsers[1]);
assertEquals(DummyParser.class.getName(), usedParsers[2]);
assertEquals(EmptyParser.class.getName(), usedParsers[3]);
assertEquals(EmptyParser.class.getName(), usedParsers[2]);


// Last Wins
Expand All @@ -217,11 +216,10 @@ public void testSupplemental() throws Exception {
assertEquals("Test2", metadata.get("TBoth"));

usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
assertEquals(4, usedParsers.length);
assertEquals(3, usedParsers.length); // DummyParser is only added once
assertEquals(ErrorParser.class.getName(), usedParsers[0]);
assertEquals(DummyParser.class.getName(), usedParsers[1]);
assertEquals(DummyParser.class.getName(), usedParsers[2]);
assertEquals(EmptyParser.class.getName(), usedParsers[3]);
assertEquals(EmptyParser.class.getName(), usedParsers[2]);


// Merge
Expand All @@ -240,11 +238,10 @@ public void testSupplemental() throws Exception {
assertEquals("Test2", metadata.getValues("TBoth")[1]);

usedParsers = metadata.getValues(TikaCoreProperties.TIKA_PARSED_BY);
assertEquals(4, usedParsers.length);
assertEquals(3, usedParsers.length); // DummyParser is only added once
assertEquals(ErrorParser.class.getName(), usedParsers[0]);
assertEquals(DummyParser.class.getName(), usedParsers[1]);
assertEquals(DummyParser.class.getName(), usedParsers[2]);
assertEquals(EmptyParser.class.getName(), usedParsers[3]);
assertEquals(EmptyParser.class.getName(), usedParsers[2]);


// Check the error details always come through, no matter the policy
Expand Down