diff --git a/build.gradle.kts b/build.gradle.kts index 070fad1..7989e8b 100644 --- a/build.gradle.kts +++ b/build.gradle.kts @@ -6,7 +6,6 @@ plugins { id("org.jlleitschuh.gradle.ktlint") version "11.3.2" kotlin("jvm") version "1.9.21" kotlin("plugin.spring") version "1.9.21" - kotlin("plugin.jpa") version "1.9.24" } group = "com.yourssu" @@ -29,8 +28,10 @@ dependencies { implementation("org.jetbrains.kotlin:kotlin-reflect") implementation("io.projectreactor.kotlin:reactor-kotlin-extensions") implementation("org.jetbrains.kotlinx:kotlinx-coroutines-reactor") - implementation("org.springframework.boot:spring-boot-starter-data-jpa") - runtimeOnly("com.mysql:mysql-connector-j") + // implementation("org.springframework.boot:spring-boot-starter-data-jpa") + implementation("org.springframework.boot:spring-boot-starter-data-r2dbc") + // implementation("io.r2dbc:r2dbc-pool") + implementation("org.mariadb:r2dbc-mariadb:1.1.2") implementation("org.springdoc:springdoc-openapi-starter-webmvc-ui:2.1.0") diff --git a/src/main/kotlin/com/yourssu/search/SearchApplication.kt b/src/main/kotlin/com/yourssu/search/SearchApplication.kt index fde79c2..1abcff1 100644 --- a/src/main/kotlin/com/yourssu/search/SearchApplication.kt +++ b/src/main/kotlin/com/yourssu/search/SearchApplication.kt @@ -2,7 +2,13 @@ package com.yourssu.search import org.springframework.boot.autoconfigure.SpringBootApplication import org.springframework.boot.runApplication +import org.springframework.data.r2dbc.config.EnableR2dbcAuditing +import org.springframework.data.r2dbc.repository.config.EnableR2dbcRepositories +import org.springframework.scheduling.annotation.EnableScheduling +@EnableScheduling +@EnableR2dbcRepositories +@EnableR2dbcAuditing @SpringBootApplication class SearchApplication diff --git a/src/main/kotlin/com/yourssu/search/crawling/config/R2dbcConfig.kt b/src/main/kotlin/com/yourssu/search/crawling/config/R2dbcConfig.kt new file mode 100644 index 0000000..08ed58c --- /dev/null +++ b/src/main/kotlin/com/yourssu/search/crawling/config/R2dbcConfig.kt @@ -0,0 +1,15 @@ +package com.yourssu.search.crawling.config + +import io.r2dbc.spi.ConnectionFactory +import org.springframework.context.annotation.Bean +import org.springframework.context.annotation.Configuration +import org.springframework.data.r2dbc.core.R2dbcEntityTemplate + +@Configuration +class R2dbcConfig(private val connectionFactory: ConnectionFactory) { + + @Bean + fun r2dbcEntityTemplate(): R2dbcEntityTemplate { + return R2dbcEntityTemplate(connectionFactory) + } +} diff --git a/src/main/kotlin/com/yourssu/search/crawling/domain/InformationUrl.kt b/src/main/kotlin/com/yourssu/search/crawling/domain/InformationUrl.kt index c5757cc..d668eff 100644 --- a/src/main/kotlin/com/yourssu/search/crawling/domain/InformationUrl.kt +++ b/src/main/kotlin/com/yourssu/search/crawling/domain/InformationUrl.kt @@ -1,25 +1,17 @@ package com.yourssu.search.crawling.domain -import jakarta.persistence.Column -import jakarta.persistence.Entity -import jakarta.persistence.EnumType -import jakarta.persistence.Enumerated -import jakarta.persistence.GeneratedValue -import jakarta.persistence.GenerationType -import jakarta.persistence.Id -import jakarta.persistence.Table +import org.springframework.data.annotation.Id +import org.springframework.data.relational.core.mapping.Column +import org.springframework.data.relational.core.mapping.Table -@Entity -@Table(name = "information_url") +@Table("information_url") class InformationUrl( - @field:Id - @field:GeneratedValue(strategy = GenerationType.IDENTITY) + @Id val id: Long? = null, - @field:Column(name = "content_url", nullable = false, unique = true, length = 500) + @Column("content_url") val contentUrl: String, - @field:Column(name = "source_type", nullable = false) - @field:Enumerated(EnumType.STRING) + @Column("source_type") val sourceType: SourceType ) diff --git a/src/main/kotlin/com/yourssu/search/crawling/repository/CoroutineInformationRepository.kt b/src/main/kotlin/com/yourssu/search/crawling/repository/CoroutineInformationRepository.kt new file mode 100644 index 0000000..19fe680 --- /dev/null +++ b/src/main/kotlin/com/yourssu/search/crawling/repository/CoroutineInformationRepository.kt @@ -0,0 +1,6 @@ +package com.yourssu.search.crawling.repository + +import com.yourssu.search.crawling.domain.Information +import org.springframework.data.elasticsearch.repository.CoroutineElasticsearchRepository + +interface CoroutineInformationRepository : CoroutineElasticsearchRepository diff --git a/src/main/kotlin/com/yourssu/search/crawling/repository/InformationUrlRepository.kt b/src/main/kotlin/com/yourssu/search/crawling/repository/InformationUrlRepository.kt index fb44dfe..d15f0ce 100644 --- a/src/main/kotlin/com/yourssu/search/crawling/repository/InformationUrlRepository.kt +++ b/src/main/kotlin/com/yourssu/search/crawling/repository/InformationUrlRepository.kt @@ -2,10 +2,11 @@ package com.yourssu.search.crawling.repository import com.yourssu.search.crawling.domain.InformationUrl import com.yourssu.search.crawling.domain.SourceType -import org.springframework.data.jpa.repository.JpaRepository +import kotlinx.coroutines.flow.Flow +import org.springframework.data.repository.kotlin.CoroutineCrudRepository import org.springframework.stereotype.Repository @Repository -interface InformationUrlRepository : JpaRepository { - fun findAllBySourceType(sourceType: SourceType): List +interface InformationUrlRepository : CoroutineCrudRepository { + fun findAllBySourceType(sourceType: SourceType): Flow } diff --git a/src/main/kotlin/com/yourssu/search/crawling/scheduler/CrawlingScheduler.kt b/src/main/kotlin/com/yourssu/search/crawling/scheduler/CrawlingScheduler.kt new file mode 100644 index 0000000..f4a2ff2 --- /dev/null +++ b/src/main/kotlin/com/yourssu/search/crawling/scheduler/CrawlingScheduler.kt @@ -0,0 +1,18 @@ +package com.yourssu.search.crawling.scheduler + +import com.yourssu.search.crawling.service.CrawlingService +import kotlinx.coroutines.runBlocking +import org.springframework.scheduling.annotation.Scheduled +import org.springframework.stereotype.Component + +@Component +class CrawlingScheduler( + private val crawlingService: CrawlingService +) { + @Scheduled(cron = "0 0 0 * * ?") // 매일 자정 12시 실행 + fun scheduleCrawling() = runBlocking { + crawlingService.strategies.keys.forEach { key -> // strategies의 의존성 주입 된 객체 순회 + crawlingService.executeCrawling(key) + } + } +} diff --git a/src/main/kotlin/com/yourssu/search/crawling/service/CrawlingService.kt b/src/main/kotlin/com/yourssu/search/crawling/service/CrawlingService.kt index 05c7fd0..caa13ea 100644 --- a/src/main/kotlin/com/yourssu/search/crawling/service/CrawlingService.kt +++ b/src/main/kotlin/com/yourssu/search/crawling/service/CrawlingService.kt @@ -4,11 +4,12 @@ import com.yourssu.search.crawling.repository.InformationRepository import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.withContext import org.springframework.stereotype.Service +import org.springframework.transaction.annotation.Transactional @Service class CrawlingService( - private val strategies: Map, - private val informationRepository: InformationRepository, + public val strategies: Map, + private val informationRepository: InformationRepository ) { suspend fun executeCrawling(strategyKey: String) { @@ -17,6 +18,7 @@ class CrawlingService( strategy.crawl() } + @Transactional suspend fun deleteData() { withContext(Dispatchers.IO) { informationRepository.deleteAll() diff --git a/src/main/kotlin/com/yourssu/search/crawling/service/CrawlingStrategy.kt b/src/main/kotlin/com/yourssu/search/crawling/service/CrawlingStrategy.kt index 5afa130..3f36e11 100644 --- a/src/main/kotlin/com/yourssu/search/crawling/service/CrawlingStrategy.kt +++ b/src/main/kotlin/com/yourssu/search/crawling/service/CrawlingStrategy.kt @@ -2,4 +2,4 @@ package com.yourssu.search.crawling.service interface CrawlingStrategy { suspend fun crawl() -} \ No newline at end of file +} diff --git a/src/main/kotlin/com/yourssu/search/crawling/service/strategy/FunCrawlingStrategy.kt b/src/main/kotlin/com/yourssu/search/crawling/service/strategy/FunCrawlingStrategy.kt index 1e89c7e..08b712b 100644 --- a/src/main/kotlin/com/yourssu/search/crawling/service/strategy/FunCrawlingStrategy.kt +++ b/src/main/kotlin/com/yourssu/search/crawling/service/strategy/FunCrawlingStrategy.kt @@ -11,7 +11,7 @@ import kotlin.time.measureTimedValue @Component("fun") class FunCrawlingStrategy( private val crawlingUtils: CrawlingUtils -): CrawlingStrategy { +) : CrawlingStrategy { private val log = LoggerFactory.getLogger(this::class.java) override suspend fun crawl() { @@ -25,8 +25,10 @@ class FunCrawlingStrategy( val toSaveDocuments: List = crawlingUtils.filteringToSaveDocuments(allDocuments, SourceType.FUN, urlSelector) + val flattenedDocuments: List = allDocuments.flatten() crawlingUtils.crawlingContents( + // toSaveDocuments = flattenedDocuments, toSaveDocuments = toSaveDocuments, titleSelector = ".content .title", contentSelector = "div .description p", @@ -38,4 +40,4 @@ class FunCrawlingStrategy( } log.info("all time use {}", duration.duration.inWholeSeconds) } -} \ No newline at end of file +} diff --git a/src/main/kotlin/com/yourssu/search/crawling/service/strategy/NoticeCrawlingStrategy.kt b/src/main/kotlin/com/yourssu/search/crawling/service/strategy/NoticeCrawlingStrategy.kt index 86a8304..e11adf2 100644 --- a/src/main/kotlin/com/yourssu/search/crawling/service/strategy/NoticeCrawlingStrategy.kt +++ b/src/main/kotlin/com/yourssu/search/crawling/service/strategy/NoticeCrawlingStrategy.kt @@ -11,7 +11,7 @@ import kotlin.time.measureTimedValue @Component("notice") class NoticeCrawlingStrategy( private val crawlingUtils: CrawlingUtils -): CrawlingStrategy { +) : CrawlingStrategy { private val log = LoggerFactory.getLogger(this::class.java) override suspend fun crawl() { @@ -25,12 +25,12 @@ class NoticeCrawlingStrategy( val toSaveDocuments: List = crawlingUtils.filteringToSaveDocuments( - lists = allDocuments, - sourceType = SourceType.NOTICE, - urlSelector = urlSelector + lists = allDocuments, sourceType = SourceType.NOTICE, urlSelector = urlSelector ) + val flattenedDocuments: List = allDocuments.flatten() crawlingUtils.crawlingContents( + // toSaveDocuments = flattenedDocuments, toSaveDocuments = toSaveDocuments, titleSelector = ".notice_col3 a .d-inline-blcok.m-pt-5", contentSelector = "div.bg-white p", @@ -42,4 +42,4 @@ class NoticeCrawlingStrategy( } log.info("all time use {}", duration.duration.inWholeSeconds) } -} \ No newline at end of file +} diff --git a/src/main/kotlin/com/yourssu/search/crawling/utils/CrawlingUtils.kt b/src/main/kotlin/com/yourssu/search/crawling/utils/CrawlingUtils.kt index 7479468..fcbc4a7 100644 --- a/src/main/kotlin/com/yourssu/search/crawling/utils/CrawlingUtils.kt +++ b/src/main/kotlin/com/yourssu/search/crawling/utils/CrawlingUtils.kt @@ -3,13 +3,14 @@ package com.yourssu.search.crawling.utils import com.yourssu.search.crawling.domain.Information import com.yourssu.search.crawling.domain.InformationUrl import com.yourssu.search.crawling.domain.SourceType -import com.yourssu.search.crawling.repository.InformationRepository +import com.yourssu.search.crawling.repository.CoroutineInformationRepository import com.yourssu.search.crawling.repository.InformationUrlRepository import kotlinx.coroutines.CoroutineScope import kotlinx.coroutines.Dispatchers import kotlinx.coroutines.Job import kotlinx.coroutines.channels.Channel import kotlinx.coroutines.coroutineScope +import kotlinx.coroutines.flow.toList import kotlinx.coroutines.joinAll import kotlinx.coroutines.launch import kotlinx.coroutines.withContext @@ -18,6 +19,7 @@ import org.jsoup.nodes.Element import org.slf4j.LoggerFactory import org.springframework.beans.factory.annotation.Value import org.springframework.stereotype.Component +import org.springframework.transaction.annotation.Transactional import java.io.FileNotFoundException import java.time.LocalDate import java.util.concurrent.atomic.AtomicBoolean @@ -26,13 +28,14 @@ import java.util.regex.Pattern @Component class CrawlingUtils( - private val informationRepository: InformationRepository, + private val coroutineElasticsearchRepository: CoroutineInformationRepository, private val informationUrlRepository: InformationUrlRepository, @Value("\${general.user-agent}") private val userAgent: String, private val coroutineScope: CoroutineScope = CoroutineScope(Dispatchers.IO), + private val coroutineInformationRepository: CoroutineInformationRepository ) { private val log = LoggerFactory.getLogger(this::class.java) @@ -43,7 +46,7 @@ class CrawlingUtils( ): List { val savedData: List withContext(Dispatchers.IO) { - savedData = informationUrlRepository.findAllBySourceType(sourceType) + savedData = informationUrlRepository.findAllBySourceType(sourceType).toList() } val savedUrls = savedData.map { it.contentUrl } @@ -97,7 +100,6 @@ class CrawlingUtils( resultList } - private suspend fun fetchPage(baseUrl: String, pageNumber: Int, ulSelector: String): List { val document = Jsoup.connect("$baseUrl/$pageNumber") .userAgent(userAgent) @@ -117,6 +119,7 @@ class CrawlingUtils( return contents } + @Transactional suspend fun crawlingContents( toSaveDocuments: List, titleSelector: String, @@ -126,8 +129,6 @@ class CrawlingUtils( favicon: String?, sourceType: SourceType ) { - val urlChannel = Channel(Channel.UNLIMITED) - val contentJobs: List = toSaveDocuments.map { element -> coroutineScope.launch { val rawDate = element.selectFirst(dateSelector)?.text() ?: "" @@ -151,32 +152,37 @@ class CrawlingUtils( return@launch } - urlChannel.send(InformationUrl(contentUrl = contentUrl, sourceType = sourceType)) - - informationRepository.save( - Information( - title = title, - content = content.toString().trim(), - date = extractedDate, - contentUrl = contentUrl, - imgList = imgList, - favicon = favicon, - source = sourceType.value + try { + // `InformationUrl`을 즉시 저장 + informationUrlRepository.save( + InformationUrl( + contentUrl = contentUrl, + sourceType = sourceType + ) + ) + log.info("Saved URL: $contentUrl") + + // `Information`도 저장 + coroutineInformationRepository.save( + Information( + title = title, + content = content.toString().trim(), + date = extractedDate, + contentUrl = contentUrl, + imgList = imgList, + favicon = favicon, + source = sourceType.value + ) ) - ) + // log.info("Saved Information for URL: $contentUrl") + } catch (e: Exception) { + // log.error("Error saving URL or Information for $contentUrl", e) + } } } contentJobs.joinAll() - urlChannel.close() - - val toSaveUrls = mutableListOf() - for (url in urlChannel) { - toSaveUrls.add(url) - } - - val distinctUrls = toSaveUrls.distinctBy { it.contentUrl } - informationUrlRepository.saveAll(distinctUrls) + log.info("Crawling and saving completed.") } private fun extractDate(dateStr: String): LocalDate? { @@ -197,4 +203,16 @@ class CrawlingUtils( null // 정규표현식에 맞지 않으면 null 반환 } } -} \ No newline at end of file + + /*@Transactional + suspend fun saveAllWithRollback(urls: List) { + urls.forEachIndexed { index, url -> + informationUrlRepository.save(url) + + // 인위적으로 예외 발생 + if (index == 90) { + throw RuntimeException("Simulated exception for rollback") + } + } + }*/ +} diff --git a/src/main/resources/application.yaml b/src/main/resources/application.yaml index 66470fd..b1932fc 100644 --- a/src/main/resources/application.yaml +++ b/src/main/resources/application.yaml @@ -14,20 +14,28 @@ springdoc: config-url: /v3/api-docs/swagger-config url: /v3/api-docs +#spring: +# datasource: +# driver-class-name: com.mysql.cj.jdbc.Driver +# url: ${DATASOURCE_URL} +# password: ${DATASOURCE_PASSWORD} +# username: ${DATASOURCE_USERNAME} spring: - datasource: - driver-class-name: com.mysql.cj.jdbc.Driver + r2dbc: url: ${DATASOURCE_URL} password: ${DATASOURCE_PASSWORD} username: ${DATASOURCE_USERNAME} - jpa: + r2dbc.pool: + enabled: true + +# jpa: # profiles: # active: local # group: # local: logging-type-condole, logging-level-local # dev: logging-type-file, logging-level-dev - hibernate: - ddl-auto: update - properties: - hibernate: - dialect: org.hibernate.dialect.MySQL8Dialect +# hibernate: +# ddl-auto: update +# properties: +# hibernate: +# dialect: org.hibernate.dialect.MySQL8Dialect diff --git a/src/test/kotlin/com/yourssu/search/laboratory/ConcurrencyTest.kt b/src/test/kotlin/com/yourssu/search/laboratory/ConcurrencyTest.kt index 418e31b..1e1cf1c 100644 --- a/src/test/kotlin/com/yourssu/search/laboratory/ConcurrencyTest.kt +++ b/src/test/kotlin/com/yourssu/search/laboratory/ConcurrencyTest.kt @@ -65,7 +65,7 @@ class ConcurrencyTest { senderJobs.joinAll() channel.close() - + for (i in channel) { newUrls.add(i) } @@ -100,18 +100,19 @@ class ConcurrencyTest { println("Round $i - Mutex: ${mutexTime}ms, Channel: ${channelTime}ms") } } - + val mutexAvg = mutexTimes.average() val mutexMin = mutexTimes.min() val mutexMax = mutexTimes.max() - + val channelAvg = channelTimes.average() val channelMin = channelTimes.min() val channelMax = channelTimes.max() val improvement = ((mutexAvg - channelAvg) / mutexAvg) * 100.0 - println(""" + println( + """ 병렬 실행 성능 비교: Mutex - 평균: ${mutexAvg}ms, @@ -124,6 +125,7 @@ class ConcurrencyTest { - 최대: ${channelMax}ms 평균 개선량: ${"%.2f".format(improvement)}% - """.trimIndent()) + """.trimIndent() + ) } -} \ No newline at end of file +} diff --git a/src/test/kotlin/com/yourssu/search/repository/InformationUrlRepositoryTest.kt b/src/test/kotlin/com/yourssu/search/repository/InformationUrlRepositoryTest.kt new file mode 100644 index 0000000..025089c --- /dev/null +++ b/src/test/kotlin/com/yourssu/search/repository/InformationUrlRepositoryTest.kt @@ -0,0 +1,73 @@ +package com.yourssu.search.repository + +import com.yourssu.search.crawling.domain.InformationUrl +import com.yourssu.search.crawling.domain.SourceType +import com.yourssu.search.crawling.repository.InformationUrlRepository +import com.yourssu.search.crawling.utils.CrawlingUtils +import kotlinx.coroutines.Dispatchers +import kotlinx.coroutines.Job +import kotlinx.coroutines.flow.toList +import kotlinx.coroutines.joinAll +import kotlinx.coroutines.launch +import kotlinx.coroutines.runBlocking +import kotlinx.coroutines.test.runTest +import org.assertj.core.api.Assertions.assertThat +import org.hibernate.validator.internal.util.Contracts.assertNotNull +import org.junit.jupiter.api.Assertions.assertEquals +import org.junit.jupiter.api.Test +import org.springframework.beans.factory.annotation.Autowired +import org.springframework.boot.test.context.SpringBootTest + +@SpringBootTest +class InformationUrlRepositoryTest { + + @Autowired + private lateinit var repository: InformationUrlRepository + + @Autowired + private lateinit var service: CrawlingUtils + + @Test + fun `save information`() = runTest { + // Arrange: 테스트용 데이터 준비 + val info = InformationUrl( + contentUrl = "http://example.com", + sourceType = SourceType.NOTICE + ) + + val savedInfo = repository.save(info) + + assertNotNull(savedInfo.id, "ID should not be null after saving") + assert(savedInfo.contentUrl == info.contentUrl) + assert(savedInfo.sourceType == info.sourceType) + } + + @Test + fun `find by sourceType`() = runBlocking { + val noticeUrls = repository.findAllBySourceType(SourceType.FUN).toList() + + println("Found ${noticeUrls.size} URLs for SourceType.NOTICE") + } + + /*@Test + fun `test transactional rollback`() = runTest { + // Arrange: 테스트 데이터 준비 + val urls = (1..100).map { + InformationUrl( + contentUrl = "http://example.com/$it", + sourceType = if (it % 10 == 0) SourceType.FUN else SourceType.NOTICE + ) + } + + try { + service.saveAllWithRollback(urls) + } catch (ex: RuntimeException) { + println("Exception occurred: ${ex.message}") + } + + // Assert: 데이터베이스에 남아 있는 URL 확인 + val remainingUrls = repository.findAll().toList() + println("Remaining URLs: ${remainingUrls.size}") + assertEquals(0, remainingUrls.size, "All changes should have been rolled back") + }*/ +}