File tree Expand file tree Collapse file tree 3 files changed +27
-15
lines changed
java/org/metafacture/html
test/java/org/metafacture/html Expand file tree Collapse file tree 3 files changed +27
-15
lines changed Original file line number Diff line number Diff line change 3030import org .metafacture .framework .helpers .DefaultObjectPipe ;
3131
3232/**
33- * Extracts the first script from an HTML document
33+ * Extracts the specified element from an HTML document
3434 *
3535 * @author Fabian Steeg
3636 */
37- @ Description ("Extracts the first script from an HTML document" )
37+ @ Description ("Extracts the specified element from an HTML document" )
3838@ In (Reader .class )
3939@ Out (String .class )
40- @ FluxCommand ("extract-script" )
41- public class ScriptExtractor extends DefaultObjectPipe <Reader , ObjectReceiver <String >> {
40+ @ FluxCommand ("extract-element" )
41+ public class ElementExtractor extends DefaultObjectPipe <Reader , ObjectReceiver <String >> {
42+ private String selector ;
43+
44+ /**
45+ * @param selector The CSS-style jsoup selector, see https://jsoup.org/cookbook/extracting-data/selector-syntax
46+ */
47+ public ElementExtractor (final String selector ) {
48+ this .selector = selector ;
49+ }
50+
4251 @ Override
4352 public void process (final Reader reader ) {
4453 try {
4554 Document document = Jsoup .parse (IOUtils .toString (reader ));
46- Element firstScript = document .select ("script" ).first ();
47- getReceiver ().process (firstScript .data ());
55+ Element firstElement = document .select (selector ).first ();
56+ getReceiver ().process (firstElement .data ());
4857 } catch (IOException e ) {
4958 e .printStackTrace ();
5059 }
Original file line number Diff line number Diff line change 1414# limitations under the License.
1515#
1616decode-html org.metafacture.html.HtmlDecoder
17- extract-script org.metafacture.html.ScriptExtractor
17+ extract-element org.metafacture.html.ElementExtractor
Original file line number Diff line number Diff line change 2828import org .mockito .MockitoAnnotations ;
2929
3030/**
31- * Tests for {@link ScriptExtractor }.
31+ * Tests for {@link ElementExtractor }.
3232 *
3333 * @author Fabian Steeg
3434 *
3535 */
36- public final class ScriptExtractorTest {
36+ public final class ElementExtractorTest {
3737
38- private static final StringReader IN = new StringReader ("<html><script>{\" code\" :\" yo\" }" );
38+ private static final StringReader IN = new StringReader ("<html>"
39+ + "<script data-test='site-head-data'>{\" code\" :\" hey\" }</script>"
40+ + "<script data-test='model-linked-data'>{\" code\" :\" yo\" }" );
41+
3942 private static final String OUT = "{\" code\" :\" yo\" }" ;
4043
41- private ScriptExtractor scriptExtractor ;
44+ private ElementExtractor elementExtractor ;
4245
4346 @ Mock
4447 private ObjectReceiver <String > receiver ;
4548
4649 @ Before
4750 public void setup () {
4851 MockitoAnnotations .initMocks (this );
49- scriptExtractor = new ScriptExtractor ( );
50- scriptExtractor .setReceiver (receiver );
52+ elementExtractor = new ElementExtractor ( "script[data-test=model-linked-data]" );
53+ elementExtractor .setReceiver (receiver );
5154 }
5255
5356 @ Test
5457 public void testShouldProcessRecordsFollowedbySeparator () {
55- scriptExtractor .process (IN );
58+ elementExtractor .process (IN );
5659 verify (receiver ).process (OUT );
5760 verifyNoMoreInteractions (receiver );
5861 }
5962
6063 @ After
6164 public void cleanup () {
62- scriptExtractor .closeStream ();
65+ elementExtractor .closeStream ();
6366 }
6467}
You can’t perform that action at this time.
0 commit comments